From f78b396ee7a5d4c47cf3e3a8cb9fb02a4d3fe250 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 13 Jan 2025 14:13:11 +0200
Subject: [PATCH 01/84] llama : add struct llama_kv_cache (wip) [no ci]

---
 common/common.cpp                |   6 +-
 common/speculative.cpp           |  10 +-
 examples/embedding/embedding.cpp |   5 +-
 include/llama.h                  |  79 +++----
 src/llama-context.cpp            |  16 +-
 src/llama-kv-cache.cpp           | 286 ++++---------------------
 src/llama-kv-cache.h             | 350 ++++++++++++++++++++++++++-----
 src/llama.cpp                    |  91 ++------
 8 files changed, 428 insertions(+), 415 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 6dea8e3d25238..29de45189e2d3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -952,7 +952,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+    llama_kv_cache * kv = llama_get_kv_cache(lctx);
+
+    if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) {
         LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -1057,7 +1059,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_cache_clear(lctx);
+        llama_kv_cache_clear(kv);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
     }
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 318e96ea35468..6ac0585178ebd 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -171,8 +171,10 @@ llama_tokens common_speculative_gen_draft(
     llama_tokens result;
     result.reserve(params.n_draft);
 
+    llama_kv_cache * kv = llama_get_kv_cache(ctx);
+
     if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_cache_clear(kv);
 
         prompt.clear();
     } else {
@@ -191,14 +193,14 @@ llama_tokens common_speculative_gen_draft(
     }
 
     if (reuse_i > 0) {
-        llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_kv_cache_seq_rm (kv, 0, 0, reuse_i);
+        llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i);
 
         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }
 
     if (reuse_n < (int) prompt.size()) {
-        llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+        llama_kv_cache_seq_rm (kv, 0, reuse_n, -1);
 
         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 38d22c90f82bb..fda0949f1c4cf 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -34,10 +34,11 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    const struct llama_model * model = llama_get_model(ctx);
+    const llama_model * model = llama_get_model(ctx);
+    llama_kv_cache * kv = llama_get_kv_cache(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_cache_clear(kv);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
diff --git a/include/llama.h b/include/llama.h
index 3b75e760780ef..08b8658ad89ac 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -60,6 +60,7 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
+    struct llama_kv_cache;
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -467,8 +468,9 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
-    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx); // TODO: remove const?
+    LLAMA_API struct llama_kv_cache *    llama_get_kv_cache(      struct llama_context * ctx);
+    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
 
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@@ -584,7 +586,7 @@ extern "C" {
     // KV cache
     //
 
-    // TODO: remove llama_kv_cache_view_* API
+    // TODO: start using struct llama_kv_cache
 
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
@@ -639,14 +641,20 @@ extern "C" {
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "use llama_kv_cache_n_tokens instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+            "use llama_kv_cache_used_cells instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
+            struct llama_kv_cache * kv);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@@ -654,26 +662,26 @@ extern "C" {
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API bool llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
 
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
 
     // Removes all tokens that do not belong to the specified sequence
     LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id);
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
 
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d);
 
     // Returns the largest position present in the KV cache for the specified sequence
     LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id);
-
-    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
-    // how to avoid this?
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id);
 
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
     //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+    LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv);
 
     // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+    LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv);
+
+    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
+    LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv);
 
     //
     // State / sessions
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 671d2a81adabf..bf5a77ccaff1b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -602,11 +602,15 @@ uint32_t llama_n_seq_max(const struct llama_context * ctx) {
     return ctx->kv_self.size;
 }
 
-const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+const llama_model * llama_get_model(const llama_context * ctx) {
     return &ctx->model;
 }
 
-enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+llama_kv_cache * llama_get_kv_cache(llama_context * ctx) {
+    return &ctx->kv_self;
+}
+
+enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
     return ctx->cparams.pooling_type;
 }
 
@@ -1142,7 +1146,7 @@ struct llama_data_read {
         if (dest_seq_id != -1) {
             // single sequence
 
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            kv_self.seq_rm(dest_seq_id, -1, -1);
 
             llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
             batch.n_tokens = cell_count;
@@ -1185,7 +1189,7 @@ struct llama_data_read {
                 return false;
             }
 
-            llama_kv_cache_clear(kv_self);
+            kv_self.clear();
 
             for (uint32_t i = 0; i < cell_count; ++i) {
                 llama_kv_cell & cell = kv_self.cells[i];
@@ -1362,9 +1366,9 @@ struct llama_data_read {
 
         if (!res) {
             if (seq_id == -1) {
-                llama_kv_cache_clear(ctx);
+                ctx->kv_self.clear();
             } else {
-                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
+                ctx->kv_self.seq_rm(seq_id, -1, -1);
             }
             throw std::runtime_error("failed to restore kv cache");
         }
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index feffdf0de52cf..b0d5a931839f8 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -350,277 +350,67 @@ uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
     return 0;
 }
 
-void llama_kv_cache_clear(struct llama_kv_cache & cache) {
-    for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
-        cache.cells[i].pos = -1;
-        cache.cells[i].seq_id.clear();
-        cache.cells[i].src = -1;
-        cache.cells[i].tail = -1;
-    }
-    cache.head = 0;
-    cache.used = 0;
-
-    for (auto & buf : cache.bufs) {
-        ggml_backend_buffer_clear(buf.get(), 0);
-    }
+void llama_kv_cache_clear(llama_kv_cache * kv) {
+    kv->clear();
 }
 
 bool llama_kv_cache_seq_rm(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1) {
-    uint32_t new_head = cache.size;
-
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-
-    // models like Mamba or RWKV can't have a state partially erased
-    if (cache.recurrent) {
-        if (seq_id >= (int64_t) cache.size) {
-            // could be fatal
-            return false;
-        }
-        if (0 <= seq_id) {
-            int32_t & tail_id = cache.cells[seq_id].tail;
-            if (tail_id >= 0) {
-                const llama_kv_cell & cell = cache.cells[tail_id];
-                // partial intersection is invalid
-                if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
-                    return false;
-                }
-                // invalidate tails which will be cleared
-                if (p0 <= cell.pos && cell.pos < p1) {
-                    tail_id = -1;
-                }
-            }
-        } else {
-            // seq_id is negative, then the range should include everything or nothing
-            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
-                return false;
-            }
-        }
-    }
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            if (seq_id < 0) {
-                cache.cells[i].seq_id.clear();
-            } else if (cache.cells[i].has_seq_id(seq_id)) {
-                cache.cells[i].seq_id.erase(seq_id);
-            } else {
-                continue;
-            }
-            if
(cache.cells[i].is_empty()) { - // keep count of the number of used cells - if (cache.cells[i].pos >= 0) cache.used--; - - cache.cells[i].pos = -1; - cache.cells[i].src = -1; - if (new_head == cache.size) new_head = i; - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size && new_head < cache.head) cache.head = new_head; - - return true; + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return kv->seq_rm(seq_id, p0, p1); } void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - - if (cache.recurrent) { - if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { - llama_kv_cell & tail_src = cache.cells[seq_id_src]; - llama_kv_cell & tail_dst = cache.cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cache.cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - cache.used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cache.cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - // otherwise, this is the KV cache of a Transformer-like model - - cache.head = 0; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].seq_id.insert(seq_id_dst); - } - } + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); } -void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) { - uint32_t new_head = cache.size; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.recurrent && (llama_seq_id) i != seq_id) { - cache.cells[i].tail = -1; - } - if (!cache.cells[i].has_seq_id(seq_id)) { - if (cache.cells[i].pos >= 0) cache.used--; - cache.cells[i].pos = -1; - cache.cells[i].src = -1; - cache.cells[i].seq_id.clear(); - if (new_head == cache.size) new_head = i; - } else { - cache.cells[i].seq_id.clear(); - cache.cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size && new_head < cache.head) cache.head = new_head; +void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id) { + kv->seq_keep(seq_id); } void llama_kv_cache_seq_add( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - uint32_t new_head = cache.size; - - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - // If there is no range then return early to avoid looping over the cache. 
- if (p0 == p1) return; - - if (cache.recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) cache.size) { - const int32_t tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cache.cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.has_shift = true; - cache.cells[i].pos += delta; - cache.cells[i].delta += delta; - - if (cache.cells[i].pos < 0) { - if (!cache.cells[i].is_empty()) { - cache.used--; - } - cache.cells[i].pos = -1; - cache.cells[i].seq_id.clear(); - if (new_head == cache.size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - cache.head = new_head != cache.size ? new_head : 0; + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + kv->seq_add(seq_id, p0, p1, delta); } void llama_kv_cache_seq_div( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - // If there is no range then return early to avoid looping over the cache. - if (p0 == p1) return; - - if (cache.recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) cache.size) { - const int32_t tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cache.cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - return; - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.has_shift = true; - - { - llama_pos p_old = cache.cells[i].pos; - cache.cells[i].pos /= d; - cache.cells[i].delta += cache.cells[i].pos - p_old; - } - } - } + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + kv->seq_div(seq_id, p0, p1, d); } -llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) { - llama_pos result = 0; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id)) { - result = std::max(result, cache.cells[i].pos); - } - } - - return result; +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id) { + return kv->seq_pos_max(seq_id); } -void llama_kv_cache_defrag(struct llama_kv_cache & cache) { - if (!cache.recurrent) { - cache.do_defrag = true; - } +void llama_kv_cache_defrag(llama_kv_cache * kv) { + kv->defrag(); } -int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv) { - int result = 0; - - for (uint32_t i = 0; i < kv.size; i++) { - result += kv.cells[i].seq_id.size(); - } - - return result; +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + return kv->n_tokens(); } -int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv) { - return kv.used; +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + return kv->used; } -bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv) { - return kv.can_shift; +bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { + return kv->can_shift; } // @@ -632,7 
+422,7 @@ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache /*.n_cells = */ 0, /*.n_seq_max = */ n_seq_max, /*.token_count = */ 0, - /*.used_cells = */ llama_get_kv_cache_used_cells(kv), + /*.used_cells = */ llama_kv_cache_used_cells(&kv), /*.max_contiguous = */ 0, /*.max_contiguous_idx = */ -1, /*.cells = */ nullptr, diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dca6f3998c645..b0bb1cfb14f12 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -57,6 +57,16 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; + int32_t n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; + } + size_t total_size() const { size_t size = 0; for (const auto & buf : bufs) { @@ -75,6 +85,297 @@ struct llama_kv_cache { return max_pos; } + + void clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } + } + + bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (recurrent) { + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; + } + + void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if (recurrent) { + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + llama_kv_cell & tail_src = cells[seq_id_src]; + llama_kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } + + return; + } + + // otherwise, this is the KV of a Transformer-like model + head = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { + cells[i].seq_id.insert(seq_id_dst); + } + } + } + + void seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if (recurrent && (llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } + } + + void seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + cells[i].pos += delta; + cells[i].delta += delta; + + if (cells[i].pos < 0) { + if (!cells[i].is_empty()) { + used--; + } + cells[i].pos = -1; + cells[i].seq_id.clear(); + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != size ? new_head : 0; + } + + void seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } + } + + llama_pos seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; + } + + void defrag() { + if (!recurrent) { + do_defrag = true; + } + } }; // a structure holds information about the slot found in llama_kv_cache_find_slot @@ -112,51 +413,6 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find how many cells are currently in use uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache); -void llama_kv_cache_clear(struct llama_kv_cache & cache); - -bool llama_kv_cache_seq_rm( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1); - -void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1); - -void llama_kv_cache_seq_keep( - struct llama_kv_cache & cache, - llama_seq_id seq_id); - -void llama_kv_cache_seq_add( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta); - -void llama_kv_cache_seq_div( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d); - -llama_pos llama_kv_cache_seq_pos_max( - struct llama_kv_cache & cache, - llama_seq_id seq_id); - -void llama_kv_cache_defrag(struct llama_kv_cache & cache); - -int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv); - -int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv); - -bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv); - // // kv cache view // @@ -206,10 +462,10 @@ struct llama_kv_slot_restorer { cache.n = old_state.n; if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased - llama_kv_cache_seq_rm(cache, -1, -1, -1); + cache.seq_rm(-1, -1, -1); } else { for (auto & slot : slot_boundaries) { - llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second); + cache.seq_rm(-1, slot.first, slot.second); } } } diff --git a/src/llama.cpp b/src/llama.cpp index 094157ccf2aa2..87dd512b2546a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8564,7 +8564,7 @@ static int llama_decode_impl( // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_cache_update(&lctx); + llama_update_kv_cache(&lctx, &lctx.kv_self); // TODO: lctx->update_kv_cache() // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -8760,7 +8760,7 @@ static int llama_decode_impl( if (fragmentation > cparams.defrag_thold) { //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); - llama_kv_cache_defrag(kv_self); + kv_self.defrag(); } } @@ -9182,11 +9182,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { 
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_kv_cache_update_impl(struct llama_context & lctx) { +static void llama_update_kv_cache_impl(llama_context & lctx, llama_kv_cache & kv) { bool need_reserve = false; - if (lctx.kv_self.has_shift) { - if (!llama_kv_cache_can_shift(&lctx)) { + if (kv.has_shift) { + if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); } @@ -9206,23 +9206,21 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) { } { - auto & kv_self = lctx.kv_self; + kv.has_shift = false; - kv_self.has_shift = false; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; } } } // defragment the KV cache if needed - if (lctx.kv_self.do_defrag) { + if (kv.do_defrag) { llama_kv_cache_defrag_impl(lctx); need_reserve = true; - lctx.kv_self.do_defrag = false; + kv.do_defrag = false; } // reserve a worst case graph again @@ -9845,6 +9843,7 @@ struct llama_context * llama_init_from_model( return ctx; } +// deprecated struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params) { @@ -9855,73 +9854,27 @@ struct llama_context * llama_new_context_with_model( // kv cache // -// TODO: tmp bridges below until `struct llama_kv_cache` is exposed through the public API - -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); } -void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) { +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { llama_kv_cache_view_update(view, ctx->kv_self); } -int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return llama_get_kv_cache_token_count(ctx->kv_self); -} - -int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) { - return llama_get_kv_cache_used_cells(ctx->kv_self); -} - -void llama_kv_cache_clear(struct llama_context * ctx) { - llama_kv_cache_clear(ctx->kv_self); -} - -bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); -} - -void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { - llama_kv_cache_seq_keep(ctx->kv_self, seq_id); -} - -void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } - - llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); -} - -void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); -} - -llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); -} - -void llama_kv_cache_defrag(struct llama_context * ctx) { - 
llama_kv_cache_defrag(ctx->kv_self); +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_cache_n_tokens(&ctx->kv_self); } -void llama_kv_cache_update(struct llama_context * ctx) { - llama_kv_cache_update_impl(*ctx); +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(&ctx->kv_self); } -bool llama_kv_cache_can_shift(struct llama_context * ctx) { - return llama_kv_cache_can_shift(ctx->kv_self); +// TODO: move to llama-context +void llama_update_kv_cache(llama_context * ctx, llama_kv_cache * kv) { + llama_update_kv_cache_impl(*ctx, *kv); } /// From e4550fbafc44403b243fe029937a97a0aed7bbd6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 14:56:52 +0200 Subject: [PATCH 02/84] llama : cont ggml-ci --- examples/batched-bench/batched-bench.cpp | 6 ++-- .../cvector-generator/cvector-generator.cpp | 3 +- examples/gritlm/gritlm.cpp | 8 +++-- examples/imatrix/imatrix.cpp | 4 ++- examples/infill/infill.cpp | 6 ++-- examples/llama-bench/llama-bench.cpp | 6 ++-- examples/lookahead/lookahead.cpp | 13 ++++---- examples/lookup/lookup.cpp | 3 +- examples/main/main.cpp | 14 +++++---- examples/parallel/parallel.cpp | 11 +++---- examples/passkey/passkey.cpp | 30 ++++++++++--------- examples/perplexity/perplexity.cpp | 24 +++++++++++---- examples/retrieval/retrieval.cpp | 4 ++- examples/run/run.cpp | 7 +++-- examples/save-load-state/save-load-state.cpp | 4 ++- examples/server/server.cpp | 25 +++++++++------- examples/simple-chat/simple-chat.cpp | 6 ++-- .../speculative-simple/speculative-simple.cpp | 4 ++- examples/speculative/speculative.cpp | 29 ++++++++++-------- 19 files changed, 128 insertions(+), 79 deletions(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f119a7..fcbad37bb3f2f 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -57,6 +57,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const int32_t n_kv_max = llama_n_ctx(ctx); llama_batch batch = llama_batch_init(n_kv_max, 0, 1); @@ -132,7 +134,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -141,7 +143,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 413b71d34c52b..adb4a60ada41f 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,7 +342,8 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache_clear(ctx); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_kv_cache_clear(kv); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 72eb46257429e..16437453edb89 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -13,6 +13,8 @@ static 
std::vector> encode(llama_context * ctx, const std::ve const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { @@ -45,7 +47,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -100,9 +102,11 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index b5f3feb9f82e6..5efe4f019f562 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -431,6 +431,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); @@ -497,7 +499,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 489a208b66b34..de8e7769552bb 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -139,6 +139,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_model_n_ctx_train(model); @@ -332,8 +334,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(kv, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 4ac19ca86ec56..8843c0048d6cc 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1575,9 +1575,11 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + test t(inst, lmodel, ctx); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // cool off before the test if (params.delay) { @@ -1617,7 +1619,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); uint64_t t_start = get_time_ns(); diff --git a/examples/lookahead/lookahead.cpp 
b/examples/lookahead/lookahead.cpp index 2f0898e6284a0..1219c207464d2 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -60,6 +60,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -95,7 +96,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -437,17 +438,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(ctx, -1, n_past, -1); + llama_kv_cache_seq_rm(kv, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(ctx, seq_id_best); - llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1); + llama_kv_cache_seq_keep(kv, seq_id_best); + llama_kv_cache_seq_cp (kv, seq_id_best, 0, -1, -1); + llama_kv_cache_seq_rm (kv, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index dbd0444ec8742..8628f7318556c 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -35,6 +35,7 @@ int main(int argc, char ** argv){ llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -192,7 +193,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + llama_kv_cache_seq_rm(kv, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index da2a03ab9ba10..9d79af79e2723 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,6 +157,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -328,7 +330,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + llama_kv_cache_seq_rm(kv, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -569,8 +571,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (kv, 0, params.n_keep , 
params.n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -593,9 +595,9 @@ int main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_cache_seq_add(kv, 0, ga_i, n_past, ib*bd); + llama_kv_cache_seq_div(kv, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_cache_seq_add(kv, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7ef43d5e12876..2ba0706dc5d24 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -134,6 +134,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -201,7 +202,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } LOG_INF("\n"); @@ -233,9 +234,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(ctx, i, -1, -1); + llama_kv_cache_seq_rm(kv, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -371,8 +372,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); + llama_kv_cache_seq_rm(kv, client.id + 1, -1, -1); + llama_kv_cache_seq_cp(kv, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5953928d47d33..e2764313b2f01 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -86,6 +86,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -132,11 +134,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_add(kv, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div(kv, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; } common_batch_clear(batch); @@ -166,12 +168,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_cache_defrag (kv); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; common_batch_clear(batch); @@ -197,12 +199,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_cache_defrag (kv); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9bf6c57433ab2..6c9f716ede23c 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -299,6 +299,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -360,7 +362,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -450,6 +452,8 @@ static 
results_perplexity perplexity(llama_context * ctx, const common_params & const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -546,7 +550,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -741,6 +745,8 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -923,7 +929,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1084,6 +1090,8 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); @@ -1202,7 +1210,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1388,6 +1396,8 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); @@ -1574,7 +1584,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1671,6 +1681,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1764,7 +1776,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 2439022a229b7..a907ea07607dd 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -82,8 +82,10 @@ static void batch_add_seq(llama_batch & batch, 
const std::vector & toke } static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + llama_kv_cache * kv = llama_get_kv_cache(ctx); + // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 92a49eb744fda..8e2c174a955e8 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -756,7 +756,8 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0; + const llama_kv_cache * kv = llama_get_kv_cache(llama_data.context.get()); + const bool is_first = llama_kv_cache_used_cells(kv) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -771,8 +772,10 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { + llama_kv_cache * kv = llama_get_kv_cache(ctx.get()); + const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); + const int n_ctx_used = llama_kv_cache_used_cells(kv); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); printe("context size exceeded\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index cf7cbd8159cf8..3839fbe8c84d5 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -156,6 +156,8 @@ int main(int argc, char ** argv) { // make new context llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); + llama_kv_cache * kv3 = llama_get_kv_cache(ctx3); + llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); @@ -196,7 +198,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(ctx3); + llama_kv_cache_clear(kv3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b1cde2d7f48dd..076044d39679c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1693,6 +1693,7 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; + llama_kv_cache * kv = nullptr; const llama_vocab * vocab = nullptr; @@ -1755,6 +1756,8 @@ struct server_context { return false; } + kv = llama_get_kv_cache(ctx); + vocab = llama_model_get_vocab(model); n_ctx = llama_n_ctx(ctx); @@ -2023,7 +2026,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); clean_kv_cache = false; } @@ -2565,8 +2568,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = 
llama_get_kv_cache_token_count(ctx); - res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv); + res->kv_cache_used_cells = llama_kv_cache_used_cells(kv); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2682,7 +2685,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); + llama_kv_cache_seq_rm(kv, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2750,8 +2753,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_rm (kv, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -2938,8 +2941,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); + llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c); + llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2977,9 +2980,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); + llama_kv_cache_seq_rm(kv, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3219,7 +3222,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index c5534cc13e4b4..130e326b55d4c 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -88,6 +88,8 @@ int main(int argc, char ** argv) { return 1; } + const llama_kv_cache * kv = llama_get_kv_cache(ctx); + // initialize the sampler llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); @@ -98,7 +100,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0; + const bool is_first = llama_kv_cache_used_cells(kv) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -113,7 +115,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_get_kv_cache_used_cells(ctx); + int n_ctx_used = 
llama_kv_cache_used_cells(kv); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 403ba2dd21914..24bdc806d5710 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -45,6 +45,8 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt.model.get(); ctx_tgt = llama_init_tgt.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx_tgt); + const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -217,7 +219,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1); + llama_kv_cache_seq_rm(kv, 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index c7ccea50dbbd4..b4e5259b5be46 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -90,6 +90,9 @@ int main(int argc, char ** argv) { model_dft = llama_init_dft.model.get(); ctx_dft = llama_init_dft.context.get(); + llama_kv_cache * kv_tgt = llama_get_kv_cache(ctx_tgt); + llama_kv_cache * kv_dft = llama_get_kv_cache(ctx_dft); + const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -420,14 +423,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); + llama_kv_cache_seq_keep(kv_dft, s_keep); + llama_kv_cache_seq_cp (kv_dft, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(kv_dft, 0); - llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, 0); + llama_kv_cache_seq_rm (kv_tgt, s_keep, n_past_tgt, -1); + llama_kv_cache_seq_keep(kv_tgt, s_keep); + llama_kv_cache_seq_cp (kv_tgt, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(kv_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -444,8 +447,8 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + llama_kv_cache_seq_rm(kv_dft, 0, n_past_dft, -1); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(kv_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -503,8 +506,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + llama_kv_cache_seq_rm(kv_dft, n_seq_cur, -1, -1); + llama_kv_cache_seq_cp(kv_dft, s, n_seq_cur, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -585,9 +588,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - 
llama_kv_cache_seq_keep(ctx_tgt, 0); + llama_kv_cache_seq_keep(kv_tgt, 0); for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); From 4d7bd03e653f24e00158ae7e819908e444a20353 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 15:50:39 +0200 Subject: [PATCH 03/84] kv_cache : functions -> members ggml-ci --- src/llama-context.cpp | 2 +- src/llama-kv-cache.cpp | 490 ++++++++++++++++++++++++++++++++++------- src/llama-kv-cache.h | 402 +++++---------------------------- src/llama.cpp | 16 +- 4 files changed, 466 insertions(+), 444 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bf5a77ccaff1b..0654feccb8951 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1169,7 +1169,7 @@ struct llama_data_read { } batch.n_seq_id[0] = 1; batch.seq_id[0] = &dest_seq_id; - if (!llama_kv_cache_find_slot(kv_self, batch)) { + if (!kv_self.find_slot(batch)) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index b0d5a931839f8..8b2f6287b8ae7 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -11,41 +11,35 @@ static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; -uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) { - // the FA kernels require padding to avoid extra runtime boundary checks - return cparams.flash_attn ? 256u : 32u; -} - -bool llama_kv_cache_init( - struct llama_kv_cache & cache, - const llama_model & model, - const llama_cparams & cparams, - ggml_type type_k, - ggml_type type_v, - uint32_t kv_size, - bool offload) { +bool llama_kv_cache::init( + const llama_model & model, + const llama_cparams & cparams, + ggml_type type_k, + ggml_type type_v, + uint32_t kv_size, + bool offload) { const struct llama_hparams & hparams = model.hparams; const int32_t n_layer = hparams.n_layer; - cache.has_shift = false; + has_shift = false; - cache.recurrent = llama_model_is_recurrent(&model); - cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + recurrent = llama_model_is_recurrent(&model); + v_trans = !recurrent && !cparams.flash_attn; + can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", - __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); + __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); - cache.head = 0; - cache.size = kv_size; - cache.used = 0; + head = 0; + size = kv_size; + used = 0; - cache.type_k = type_k; - cache.type_v = type_v; + type_k = type_k; + type_v = type_v; - cache.cells.clear(); - cache.cells.resize(kv_size); + cells.clear(); + cells.resize(kv_size); // create a context for each buffer type std::map ctx_map; @@ -57,19 +51,23 @@ bool llama_kv_cache_init( /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; + ggml_context * ctx = ggml_init(params); if (!ctx) { return nullptr; } + ctx_map[buft] = ctx; - cache.ctxs.emplace_back(ctx); + ctxs.emplace_back(ctx); + return ctx; } + return it->second; }; - cache.k_l.reserve(n_layer); - cache.v_l.reserve(n_layer); + 
k_l.reserve(n_layer); + v_l.reserve(n_layer); for (int i = 0; i < n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); @@ -95,8 +93,8 @@ bool llama_kv_cache_init( ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); - cache.k_l.push_back(k); - cache.v_l.push_back(v); + k_l.push_back(k); + v_l.push_back(v); } // allocate tensors and initialize the buffers to avoid NaNs in the padding @@ -111,20 +109,339 @@ bool llama_kv_cache_init( } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - cache.bufs.emplace_back(buf); + bufs.emplace_back(buf); } return true; } -struct llama_kv_cache_slot_info llama_kv_cache_find_slot( - struct llama_kv_cache & cache, +int32_t llama_kv_cache::n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; +} + +size_t llama_kv_cache::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; +} + +// TODO: better data structures to reduce the cost of this operation +llama_pos llama_kv_cache::max_pos() const { + llama_pos max_pos = -1; + for (const auto & cell : cells) { + max_pos = std::max(max_pos, cell.pos); + } + + return max_pos; +} + +void llama_kv_cache::clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } +} + +bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (recurrent) { + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
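    // (head only moves backwards here; the next find_slot()/decode advances it again)
    //
    // Illustrative call-site sketch, assuming a handle obtained via llama_get_kv_cache(ctx)
    // as in the example programs above -- the range is half-open [p0, p1) and negative
    // bounds mean "from the start" / "to the end":
    //
    //   llama_kv_cache_seq_rm(kv, seq_id, n_keep, n_keep + n_discard); // drop a window
    //   llama_kv_cache_seq_rm(kv, seq_id, -1, -1);                     // wipe the whole sequence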
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; +} + +void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if (recurrent) { + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + llama_kv_cell & tail_src = cells[seq_id_src]; + llama_kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } + + return; + } + + // otherwise, this is the KV of a Transformer-like model + head = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { + cells[i].seq_id.insert(seq_id_dst); + } + } +} + +void llama_kv_cache::seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if (recurrent && (llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + cells[i].pos += delta; + cells[i].delta += delta; + + if (cells[i].pos < 0) { + if (!cells[i].is_empty()) { + used--; + } + cells[i].pos = -1; + cells[i].seq_id.clear(); + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != size ? new_head : 0; +} + +void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
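    // (seq_div() rescales positions in place: pos /= d, accumulating the change in `delta`
    //  and raising has_shift so the shift can later be applied to the K cache; this is the
    //  path used by, e.g., the grouped / self-extend attention examples)
    //
    // Hypothetical caller sketch (names are illustrative, not from this patch):
    //
    //   kv.seq_div(seq_id, p0, p1, ga_n); // divide positions in [p0, p1) by the group size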
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } +} + +llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +void llama_kv_cache::defrag() { + if (!recurrent) { + do_defrag = true; + } +} + +struct llama_kv_cache_slot_info llama_kv_cache::find_slot( const struct llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = ubatch.n_seqs; const uint32_t n_seq_tokens = ubatch.n_seq_tokens; - if (cache.recurrent) { + if (recurrent) { // For recurrent state architectures (like Mamba or RWKV), // each cache cell can store the state for a whole sequence. // A slot should be always be contiguous. @@ -132,7 +449,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // can only process batches with an equal number of new tokens in each sequence GGML_ASSERT(ubatch.equal_seqs); - int32_t min = cache.size - 1; + int32_t min = size - 1; int32_t max = 0; // everything should fit if all seq_ids are smaller than the max @@ -141,16 +458,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t j = 0; j < n_seq_id; ++j) { const llama_seq_id seq_id = ubatch.seq_id[s][j]; - if (seq_id < 0 || (uint32_t) seq_id >= cache.size) { + if (seq_id < 0 || (uint32_t) seq_id >= size) { // too big seq_id // TODO: would it be possible to resize the cache instead? 
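            // (for recurrent models the cache keeps one cell per sequence, so `size` doubles
            //  as the maximum number of parallel sequences and the seq_id is used directly
            //  as a cell index -- hence the hard failure here instead of a slot search)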
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); + LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); return llama_kv_cache_slot_info_failed; } if (j > 0) { - llama_kv_cell & seq = cache.cells[seq_id]; + llama_kv_cell & seq = cells[seq_id]; if (seq.tail >= 0) { - llama_kv_cell & cell = cache.cells[seq.tail]; + llama_kv_cell & cell = cells[seq.tail]; // clear cells from seq_ids that become shared // (should not normally happen, but let's handle it anyway) cell.seq_id.erase(seq_id); @@ -158,7 +475,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( if (cell.seq_id.empty()) { cell.pos = -1; cell.src = -1; - cache.used -= 1; + used -= 1; } } } @@ -168,9 +485,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( #ifndef NDEBUG { std::vector tails_verif; - tails_verif.assign(cache.size, -1); - for (uint32_t i = 0; i < cache.size; ++i) { - llama_kv_cell & cell = cache.cells[i]; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + llama_kv_cell & cell = cells[i]; for (llama_seq_id seq_id : cell.seq_id) { if (tails_verif[seq_id] != -1) { LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); @@ -178,20 +495,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( tails_verif[seq_id] = i; } } - for (uint32_t i = 0; i < cache.size; ++i) { - if (tails_verif[i] != cache.cells[i].tail) { - LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]); + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); } } } #endif // find next empty cell - uint32_t next_empty_cell = cache.head; + uint32_t next_empty_cell = head; - for (uint32_t i = 0; i < cache.size; ++i) { - if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } - llama_kv_cell & cell = cache.cells[next_empty_cell]; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; if (cell.is_empty()) { break; } next_empty_cell += 1; } @@ -199,20 +516,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find usable cell range for (uint32_t s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; - llama_kv_cell & seq_meta = cache.cells[seq_id]; + llama_kv_cell & seq_meta = cells[seq_id]; bool has_cell = false; if (seq_meta.tail >= 0) { - llama_kv_cell & cell = cache.cells[seq_meta.tail]; + llama_kv_cell & cell = cells[seq_meta.tail]; GGML_ASSERT(cell.has_seq_id(seq_id)); // does this seq_id "own" the cell? 
if (cell.seq_id.size() == 1) { has_cell = true; } } if (!has_cell) { - llama_kv_cell & empty_cell = cache.cells[next_empty_cell]; + llama_kv_cell & empty_cell = cells[next_empty_cell]; GGML_ASSERT(empty_cell.is_empty()); // copy old tail into the empty cell if (seq_meta.tail >= 0) { - llama_kv_cell & orig_cell = cache.cells[seq_meta.tail]; + llama_kv_cell & orig_cell = cells[seq_meta.tail]; empty_cell.pos = orig_cell.pos; empty_cell.src = orig_cell.src; orig_cell.seq_id.erase(seq_id); @@ -222,9 +539,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find next empty cell if (s + 1 < n_seqs) { next_empty_cell += 1; - for (uint32_t i = 0; i < cache.size; ++i) { - if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } - llama_kv_cell & cell = cache.cells[next_empty_cell]; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; if (cell.is_empty()) { break; } next_empty_cell += 1; } @@ -237,10 +554,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // gather and re-order for (uint32_t s = 0; s < n_seqs; ++s) { int32_t dst_id = s + min; - int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; if (dst_id != src_id) { - llama_kv_cell & dst_cell = cache.cells[dst_id]; - llama_kv_cell & src_cell = cache.cells[src_id]; + llama_kv_cell & dst_cell = cells[dst_id]; + llama_kv_cell & src_cell = cells[src_id]; std::swap(dst_cell.pos, src_cell.pos); std::swap(dst_cell.src, src_cell.src); @@ -248,10 +565,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // swap tails (assuming they NEVER overlap) for (const llama_seq_id seq_id : src_cell.seq_id) { - cache.cells[seq_id].tail = src_id; + cells[seq_id].tail = src_id; } for (const llama_seq_id seq_id : dst_cell.seq_id) { - cache.cells[seq_id].tail = dst_id; + cells[seq_id].tail = dst_id; } } } @@ -260,7 +577,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; ++s) { const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; int32_t cell_id = s + min; - llama_kv_cell & cell = cache.cells[cell_id]; + llama_kv_cell & cell = cells[cell_id]; if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { // What should happen when the pos backtracks or skips a value? @@ -273,41 +590,41 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { const llama_seq_id seq_id = ubatch.seq_id[s][j]; cell.seq_id.insert(seq_id); - cache.cells[seq_id].tail = cell_id; + cells[seq_id].tail = cell_id; } } // allow getting the range of used cells, from head to head + n - cache.head = min; - cache.n = max - min + 1; - cache.used = std::count_if(cache.cells.begin(), cache.cells.end(), + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), [](const llama_kv_cell& cell){ return !cell.is_empty(); }); // sanity check - return llama_kv_cache_slot_info(cache.n >= n_seqs); + return llama_kv_cache_slot_info(n >= n_seqs); } // otherwise, one cell per token. 
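    // (the non-recurrent path below is a plain ring-buffer search: starting at `head`, look
    //  for a contiguous run of n_tokens free cells, wrapping around and giving up once the
    //  whole cache has been tested)
    //
    // Minimal usage sketch of the member API introduced by this patch series (error handling
    //  is illustrative only):
    //
    //   const auto slot = kv_self.find_slot(ubatch);
    //   if (!slot) {
    //       // no contiguous space left -- defragment or fail the decode
    //   }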
- if (n_tokens > cache.size) { - LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size); + if (n_tokens > size) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); return llama_kv_cache_slot_info_failed; } uint32_t n_tested = 0; while (true) { - if (cache.head + n_tokens > cache.size) { - n_tested += cache.size - cache.head; - cache.head = 0; + if (head + n_tokens > size) { + n_tested += size - head; + head = 0; continue; } bool found = true; for (uint32_t i = 0; i < n_tokens; i++) { - if (cache.cells[cache.head + i].pos >= 0) { + if (cells[head + i].pos >= 0) { found = false; - cache.head += i + 1; - n_tested += i + 1; + head += i + 1; + n_tested += i + 1; break; } } @@ -316,7 +633,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( break; } - if (n_tested >= cache.size) { + if (n_tested >= size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return llama_kv_cache_slot_info_failed; } @@ -325,22 +642,27 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; s++) { for (uint32_t i = 0; i < n_seq_tokens; ++i) { uint32_t k = s*n_seq_tokens + i; - cache.cells[cache.head + k].pos = ubatch.pos[k]; + cells[head + k].pos = ubatch.pos[k]; for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { - cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]); + cells[head + k].seq_id.insert(ubatch.seq_id[s][j]); } } } - cache.used += n_tokens; + used += n_tokens; - return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens); + return llama_kv_cache_slot_info(head, head + n_tokens); +} + +uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) const { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; } -uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { - for (uint32_t i = cache.size; i > 0; --i) { - const llama_kv_cell & cell = cache.cells[i - 1]; +uint32_t llama_kv_cache::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const llama_kv_cell & cell = cells[i - 1]; if (cell.pos >= 0 && !cell.is_empty()) { return i; diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index b0bb1cfb14f12..4ee3418d80334 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -7,6 +7,9 @@ #include #include +struct llama_cparams; +struct llama_ubatch; + struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; @@ -28,7 +31,19 @@ struct llama_kv_cell { } }; +// a structure holds information about the slot found in llama_kv_cache_find_slot +struct llama_kv_cache_slot_info { + std::pair boundaries; // slot boundaries [begin, end) + bool found = false; // the slot was found + + explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} + llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} + + operator bool() const { return found; } +}; + // ring-buffer of cached KV data +// TODO: pimpl struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; @@ -57,370 +72,48 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; - int32_t n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); - } - - return result; - } + // TODO: become constructor + bool init( + const llama_model & model, + const llama_cparams & cparams, + ggml_type type_k, + ggml_type type_v, + uint32_t kv_size, + bool offload); - size_t total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } + int32_t n_tokens() const; - return size; - } + size_t total_size() const; // TODO: better data structures to reduce the cost of this operation - llama_pos max_pos() const { - llama_pos max_pos = -1; - for (const auto & cell : cells) { - max_pos = std::max(max_pos, cell.pos); - } - - return max_pos; - } - - void clear() { - for (int32_t i = 0; i < (int32_t) size; ++i) { - cells[i].pos = -1; - cells[i].seq_id.clear(); - cells[i].src = -1; - cells[i].tail = -1; - } - head = 0; - used = 0; + llama_pos max_pos() const; - for (auto & buf : bufs) { - ggml_backend_buffer_clear(buf.get(), 0); - } - } + void clear(); - bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - uint32_t new_head = size; + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); + void seq_keep(llama_seq_id seq_id); + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); - if (p0 < 0) { - p0 = 0; - } + llama_pos seq_pos_max(llama_seq_id seq_id); - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } + void defrag(); - // models like Mamba or RWKV can't have a state partially erased - if (recurrent) { - if (seq_id >= (int64_t) size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - const llama_kv_cell & cell = cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } 
- } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - } + // find an empty slot of size "n_tokens" in the cache + // updates the cache head + // returns a structure holding information about the slot found + // Note: On success, it's important that cache.head points + // to the first cell of the slot. + llama_kv_cache_slot_info find_slot(const llama_ubatch & batch); - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos >= p0 && cells[i].pos < p1) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != size && new_head < head) { - head = new_head; - } - - return true; - } - - void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - if (recurrent) { - if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { - llama_kv_cell & tail_src = cells[seq_id_src]; - llama_kv_cell & tail_dst = cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - - // otherwise, this is the KV of a Transformer-like model - head = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { - cells[i].seq_id.insert(seq_id_dst); - } - } - } - - void seq_keep(llama_seq_id seq_id) { - uint32_t new_head = size; - - for (uint32_t i = 0; i < size; ++i) { - if (recurrent && (llama_seq_id) i != seq_id) { - cells[i].tail = -1; - } - - if (!cells[i].has_seq_id(seq_id)) { - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - cells[i].seq_id.clear(); - - if (new_head == size){ - new_head = i; - } - } else { - cells[i].seq_id.clear(); - cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. 
- if (new_head != size && new_head < head) { - head = new_head; - } - } - - void seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } + // TODO: maybe not needed + uint32_t get_padding(const llama_cparams & cparams) const; - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the - if (p0 == p1) { - return; - } - - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - cells[i].pos += delta; - cells[i].delta += delta; - - if (cells[i].pos < 0) { - if (!cells[i].is_empty()) { - used--; - } - cells[i].pos = -1; - cells[i].seq_id.clear(); - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - head = new_head != size ? new_head : 0; - } - - void seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the cache. - if (p0 == p1) { - return; - } - - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - - { - llama_pos p_old = cells[i].pos; - cells[i].pos /= d; - cells[i].delta += cells[i].pos - p_old; - } - } - } - } - - llama_pos seq_pos_max(llama_seq_id seq_id) { - llama_pos result = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::max(result, cells[i].pos); - } - } - - return result; - } - - void defrag() { - if (!recurrent) { - do_defrag = true; - } - } -}; - -// a structure holds information about the slot found in llama_kv_cache_find_slot -struct llama_kv_cache_slot_info { - std::pair boundaries; // slot boundaries [begin, end) - bool found = false; // the slot was found - - explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} - llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} - - operator bool() const { return found; } + // find how many cells are currently in use + uint32_t cell_max() const; }; -// TODO: maybe not needed -uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams); - -bool llama_kv_cache_init( - struct llama_kv_cache & cache, - const llama_model & model, - const llama_cparams & cparams, - ggml_type type_k, - ggml_type type_v, - uint32_t kv_size, - bool offload); - -// find an empty slot of size "n_tokens" in the cache -// updates the cache head -// returns 
a structure holding information about the slot found -// Note: On success, it's important that cache.head points -// to the first cell of the slot. -struct llama_kv_cache_slot_info llama_kv_cache_find_slot( - struct llama_kv_cache & cache, - const struct llama_ubatch & batch); - -// find how many cells are currently in use -uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache); - -// -// kv cache view -// - -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); - -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); - // // kv cache restore // @@ -472,3 +165,10 @@ struct llama_kv_slot_restorer { } }; +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); + +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); diff --git a/src/llama.cpp b/src/llama.cpp index 87dd512b2546a..d8427af9d1b6d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8572,18 +8572,18 @@ static int llama_decode_impl( kv_self.head = 0; } - const auto slot = llama_kv_cache_find_slot(kv_self, ubatch); - if (!slot) { + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { return 1; } - kv_slot_restorer.save(slot); + kv_slot_restorer.save(slot_info); if (!kv_self.recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = llama_kv_cache_get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad))); + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); //kv_self.n = llama_kv_cache_cell_max(kv_self); } } @@ -8969,7 +8969,7 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { const uint32_t n_layer = hparams.n_layer; - const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_kv = kv_self.cell_max(); const uint32_t n_used = kv_self.used; assert(n_used <= n_kv); @@ -9550,7 +9550,7 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->kv_self.get_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; @@ -9692,7 +9692,7 @@ struct llama_context * llama_init_from_model( llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!ctx->kv_self.init(ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; From fef90cb3d7a823bd00a7899b52ffc70a4f824d44 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 15:58:20 +0200 Subject: [PATCH 04/84] kv_cache : fix ggml-ci --- src/llama-kv-cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 8b2f6287b8ae7..fe59867684a85 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -35,8 +35,8 @@ bool llama_kv_cache::init( size = kv_size; used = 0; - type_k = type_k; - type_v = type_v; + this->type_k = type_k; + this->type_v = type_v; cells.clear(); cells.resize(kv_size); From 73a14eccc9f200d6012963af9448042dfeac54fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 11:56:53 +0200 Subject: [PATCH 05/84] kv_cache : minor --- src/llama-kv-cache.cpp | 38 +++++++++++++++++++++++++++++++------- src/llama-kv-cache.h | 18 +++++++++++------- src/llama.cpp | 18 +++++------------- 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index fe59867684a85..9f3b4e5144415 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -73,17 +73,22 @@ bool llama_kv_cache::init( const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); - LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa); + const char * dev_name = "CPU"; ggml_backend_buffer_type_t buft; if (offload) { auto * dev = model.dev_layer(i); buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); } else { buft = ggml_backend_cpu_buffer_type(); } - ggml_context * ctx = ctx_for_buft(buft); + LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, + i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); return false; @@ -134,14 +139,13 @@ size_t llama_kv_cache::total_size() const { return size; } -// TODO: better data structures to reduce the cost of this operation -llama_pos llama_kv_cache::max_pos() const { - llama_pos max_pos = -1; +llama_pos llama_kv_cache::pos_max() const { + llama_pos pos_max = -1; for (const auto & cell : cells) { - max_pos = std::max(max_pos, cell.pos); + pos_max = std::max(pos_max, cell.pos); } - return max_pos; + return pos_max; } void llama_kv_cache::clear() { @@ -672,6 +676,26 @@ uint32_t llama_kv_cache::cell_max() const { return 0; } +size_t llama_kv_cache::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + void llama_kv_cache_clear(llama_kv_cache * 
kv) { kv->clear(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 4ee3418d80334..97285481e3588 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -61,17 +61,11 @@ struct llama_kv_cache { // computed before each graph build uint32_t n = 0; - ggml_type type_k = GGML_TYPE_F16; - ggml_type type_v = GGML_TYPE_F16; - std::vector cells; std::vector k_l; // per layer std::vector v_l; - std::vector ctxs; - std::vector bufs; - // TODO: become constructor bool init( const llama_model & model, @@ -86,7 +80,7 @@ struct llama_kv_cache { size_t total_size() const; // TODO: better data structures to reduce the cost of this operation - llama_pos max_pos() const; + llama_pos pos_max() const; void clear(); @@ -112,6 +106,16 @@ struct llama_kv_cache { // find how many cells are currently in use uint32_t cell_max() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + +private: + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + + std::vector ctxs; + std::vector bufs; }; // diff --git a/src/llama.cpp b/src/llama.cpp index d8427af9d1b6d..0227ba6b36a93 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1973,7 +1973,7 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -8456,7 +8456,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens_all = batch.n_tokens; @@ -8792,7 +8792,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens = batch.n_tokens; @@ -9699,16 +9699,8 @@ struct llama_context * llama_init_from_model( } { - size_t memory_size_k = 0; - size_t memory_size_v = 0; - - for (auto & k : ctx->kv_self.k_l) { - memory_size_k += ggml_nbytes(k); - } - - for (auto & v : ctx->kv_self.v_l) { - memory_size_v += ggml_nbytes(v); - } + const size_t memory_size_k = ctx->kv_self.size_k_bytes(); + const size_t memory_size_v = ctx->kv_self.size_v_bytes(); LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), From 4cd1b6fa4cc4e8da927caac5c61b9fcd096a1ace Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 12:33:13 +0200 Subject: [PATCH 06/84] context : prepare kv_cache_read/write to be moved to kv_cache ggml-ci --- src/llama-context.cpp | 153 +++++++++++++++++++++--------------------- src/llama-kv-cache.h | 1 + 2 files changed, 76 insertions(+), 78 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0654feccb8951..8fc6de2f271f9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -928,11 +928,8 @@ struct llama_data_write { } } - void write_kv_cache_data(const struct llama_context * ctx, const std::vector> & cell_ranges) { - const struct llama_kv_cache & kv_self = ctx->kv_self; - const struct llama_hparams & hparams = ctx->model.hparams; - - const uint32_t v_trans = kv_self.v_trans ? 1 : 0; + void write_kv_cache_data(const llama_kv_cache & kv, const llama_hparams & hparams, const std::vector> & cell_ranges) { + const uint32_t v_trans = kv.v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; write(&v_trans, sizeof(v_trans)); @@ -946,52 +943,52 @@ struct llama_data_write { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); // Write key type - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + const int32_t k_type_i = (int32_t)kv.k_l[il]->type; write(&k_type_i, sizeof(k_type_i)); // Write row size of key - const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const uint64_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); write(&k_size_row, sizeof(k_size_row)); // Read each range of cells of k_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * k_size_row; - write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); + write_tensor_data(kv.k_l[il], range.first * k_size_row, buf_size); } } - if (!kv_self.v_trans) { + if (!kv.v_trans) { for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; write(&v_type_i, sizeof(v_type_i)); // Write row size of value - const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + const uint64_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); write(&v_size_row, sizeof(v_size_row)); // Read each range of cells of v_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * v_size_row; - write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); + 
write_tensor_data(kv.v_l[il], range.first * v_size_row, buf_size); } } } else { // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv_self.size; + const uint32_t kv_size = kv.size; for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; write(&v_type_i, sizeof(v_type_i)); // Write element size - const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const uint32_t v_size_el = ggml_type_size(kv.v_l[il]->type); write(&v_size_el, sizeof(v_size_el)); // Write GQA embedding size @@ -1004,37 +1001,36 @@ struct llama_data_write { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; const size_t buf_size = range_size * v_size_el; - write_tensor_data(kv_self.v_l[il], src_offset, buf_size); + write_tensor_data(kv.v_l[il], src_offset, buf_size); } } } } } - void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) { - const struct llama_kv_cache & kv_self = ctx->kv_self; + void write_kv_cache(const llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; // Count the number of cells with the specified seq_id // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = kv_self.size; - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; + uint32_t cell_range_begin = kv.size; + for (uint32_t i = 0; i < kv.size; ++i) { + const auto & cell = kv.cells[i]; if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { ++cell_count; - if (cell_range_begin == kv_self.size) { + if (cell_range_begin == kv.size) { cell_range_begin = i; } } else { - if (cell_range_begin != kv_self.size) { + if (cell_range_begin != kv.size) { cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv_self.size; + cell_range_begin = kv.size; } } } - if (cell_range_begin != kv_self.size) { - cell_ranges.emplace_back(cell_range_begin, kv_self.size); + if (cell_range_begin != kv.size) { + cell_ranges.emplace_back(cell_range_begin, kv.size); } // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count @@ -1046,8 +1042,8 @@ struct llama_data_write { write(&cell_count, sizeof(cell_count)); - write_kv_cache_meta(kv_self, cell_ranges, seq_id); - write_kv_cache_data(ctx, cell_ranges); + write_kv_cache_meta(kv, cell_ranges, seq_id); + write_kv_cache_data(kv, hparams, cell_ranges); } }; @@ -1140,15 +1136,15 @@ struct llama_data_read { } } - bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { - struct llama_kv_cache & kv_self = ctx->kv_self; - + bool read_kv_cache_meta(llama_kv_cache & kv, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { if (dest_seq_id != -1) { // single sequence - kv_self.seq_rm(dest_seq_id, -1, -1); + kv.seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false); batch.n_tokens = cell_count; batch.n_seq_tokens = cell_count; batch.n_seqs = 1; @@ -1157,7 +1153,7 @@ struct llama_data_read { llama_pos pos; uint32_t n_seq_id; - 
read_to(&pos, sizeof(pos)); + read_to(&pos, sizeof(pos)); read_to(&n_seq_id, sizeof(n_seq_id)); if (n_seq_id != 0) { @@ -1169,30 +1165,30 @@ struct llama_data_read { } batch.n_seq_id[0] = 1; batch.seq_id[0] = &dest_seq_id; - if (!kv_self.find_slot(batch)) { + if (!kv.find_slot(batch)) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } - // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells - GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); - GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv.head + cell_count <= kv.size); + GGML_ASSERT(kv.cells[kv.head].pos == batch.pos[0]); + GGML_ASSERT(kv.cells[kv.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(kv.cells[kv.head].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv.cells[kv.head + cell_count - 1].has_seq_id(dest_seq_id)); } else { // whole KV cache restore - if (cell_count > kv_self.size) { + if (cell_count > kv.size) { LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); return false; } - kv_self.clear(); + kv.clear(); for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = kv_self.cells[i]; + llama_kv_cell & cell = kv.cells[i]; llama_pos pos; uint32_t n_seq_id; @@ -1206,15 +1202,18 @@ struct llama_data_read { llama_seq_id seq_id; read_to(&seq_id, sizeof(seq_id)); - if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + // TODO: llama_kv_cache should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); return false; } cell.seq_id.insert(seq_id); - if (kv_self.recurrent) { - int32_t & tail = kv_self.cells[seq_id].tail; + if (kv.recurrent) { + int32_t & tail = kv.cells[seq_id].tail; if (tail != -1) { LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); return false; @@ -1224,24 +1223,22 @@ struct llama_data_read { } } - kv_self.head = 0; - kv_self.used = cell_count; + kv.head = 0; + kv.used = cell_count; } - if (kv_self.recurrent) { + if (kv.recurrent) { for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = kv_self.head + i; + uint32_t cell_id = kv.head + i; // make sure the recurrent states will keep their restored state - kv_self.cells[cell_id].src = cell_id; + kv.cells[cell_id].src = cell_id; } } return true; } - bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) { - const struct llama_hparams & hparams = ctx->model.hparams; - struct llama_kv_cache & kv_self = ctx->kv_self; + bool read_kv_cache_data(llama_kv_cache & kv, const llama_hparams & hparams, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; read_to(&v_trans, sizeof(v_trans)); @@ 
-1251,11 +1248,11 @@ struct llama_data_read { LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); return false; } - if (cell_count > kv_self.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size); + if (cell_count > kv.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv.size); return false; } - if (kv_self.v_trans != (bool) v_trans) { + if (kv.v_trans != (bool) v_trans) { LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); return false; } @@ -1267,7 +1264,7 @@ struct llama_data_read { // Read type of key int32_t k_type_i_ref; read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + const int32_t k_type_i = (int32_t)kv.k_l[il]->type; if (k_type_i != k_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); return false; @@ -1276,7 +1273,7 @@ struct llama_data_read { // Read row size of key uint64_t k_size_row_ref; read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); if (k_size_row != k_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); return false; @@ -1284,18 +1281,18 @@ struct llama_data_read { if (cell_count) { // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row); + ggml_backend_tensor_set(kv.k_l[il], read(cell_count * k_size_row), kv.head * k_size_row, cell_count * k_size_row); } } - if (!kv_self.v_trans) { + if (!kv.v_trans) { for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Read type of value int32_t v_type_i_ref; read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; if (v_type_i != v_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1304,7 +1301,7 @@ struct llama_data_read { // Read row size of value uint64_t v_size_row_ref; read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + const size_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); if (v_size_row != v_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); return false; @@ -1312,7 +1309,7 @@ struct llama_data_read { if (cell_count) { // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row); + ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_row), kv.head * v_size_row, cell_count * v_size_row); } } } else { @@ -1323,7 +1320,7 @@ struct llama_data_read { // Read type of value int32_t v_type_i_ref; read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; if (v_type_i != v_type_i_ref) { 
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1332,7 +1329,7 @@ struct llama_data_read { // Read element size of value uint32_t v_size_el_ref; read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size_el = ggml_type_size(kv.v_l[il]->type); if (v_size_el != v_size_el_ref) { LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); return false; @@ -1349,8 +1346,8 @@ struct llama_data_read { if (cell_count) { // For each row in the transposed matrix, read the values for the whole cell range for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el; - ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + const size_t dst_offset = (kv.head + j * kv.size) * v_size_el; + ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); } } } @@ -1358,17 +1355,17 @@ struct llama_data_read { return true; } - void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) { + void read_kv_cache(llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { uint32_t cell_count; read_to(&cell_count, sizeof(cell_count)); - bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count); + bool res = read_kv_cache_meta(kv, cell_count, seq_id) && read_kv_cache_data(kv, hparams, cell_count); if (!res) { if (seq_id == -1) { - ctx->kv_self.clear(); + kv.clear(); } else { - ctx->kv_self.seq_rm(seq_id, -1, -1); + kv.seq_rm(seq_id, -1, -1); } throw std::runtime_error("failed to restore kv cache"); } @@ -1521,7 +1518,7 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_logits(ctx); data_ctx.write_embeddings(ctx); - data_ctx.write_kv_cache(ctx); + data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams); return data_ctx.get_size_written(); } @@ -1558,7 +1555,7 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_logits(ctx); data_ctx.read_embeddings(ctx); - data_ctx.read_kv_cache(ctx); + data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams); return data_ctx.get_size_read(); } @@ -1654,7 +1651,7 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - data_ctx.write_kv_cache(ctx, seq_id); + data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams, seq_id); return data_ctx.get_size_written(); } @@ -1677,7 +1674,7 @@ size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - data_ctx.read_kv_cache(ctx, dest_seq_id); + data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams, dest_seq_id); return data_ctx.get_size_read(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 97285481e3588..7fc2fabf5163d 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -44,6 +44,7 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl +// TODO: add notion of max sequences struct llama_kv_cache { bool 
has_shift = false; bool do_defrag = false; From fd05ab87aad1221535da86d5cd810ee5856ebb49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 13:13:35 +0200 Subject: [PATCH 07/84] kv_cache : move state read/write to llama_kv_cache ggml-ci --- src/llama-context.cpp | 424 +++++------------------------------------ src/llama-kv-cache.cpp | 378 ++++++++++++++++++++++++++++++++++++ src/llama-kv-cache.h | 20 ++ 3 files changed, 446 insertions(+), 376 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8fc6de2f271f9..0e146652c5996 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -908,143 +908,6 @@ struct llama_data_write { write(ctx->embd, embeddings_size * sizeof(float)); } } - - void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) { - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = kv_self.cells[i]; - const llama_pos pos = cell.pos; - const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; - - write(&pos, sizeof(pos)); - write(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - write(&seq_id, sizeof(seq_id)); - } - } - } - } - } - - void write_kv_cache_data(const llama_kv_cache & kv, const llama_hparams & hparams, const std::vector> & cell_ranges) { - const uint32_t v_trans = kv.v_trans ? 1 : 0; - const uint32_t n_layer = hparams.n_layer; - - write(&v_trans, sizeof(v_trans)); - write(&n_layer, sizeof(n_layer)); - - std::vector tmp_buf; - - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Write key type - const int32_t k_type_i = (int32_t)kv.k_l[il]->type; - write(&k_type_i, sizeof(k_type_i)); - - // Write row size of key - const uint64_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); - write(&k_size_row, sizeof(k_size_row)); - - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * k_size_row; - write_tensor_data(kv.k_l[il], range.first * k_size_row, buf_size); - } - } - - if (!kv.v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - write(&v_type_i, sizeof(v_type_i)); - - // Write row size of value - const uint64_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); - write(&v_size_row, sizeof(v_size_row)); - - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * v_size_row; - write_tensor_data(kv.v_l[il], range.first * v_size_row, buf_size); - } - } - } else { - // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv.size; - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - write(&v_type_i, sizeof(v_type_i)); - - // Write element size - const uint32_t v_size_el = 
ggml_type_size(kv.v_l[il]->type); - write(&v_size_el, sizeof(v_size_el)); - - // Write GQA embedding size - write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); - - // For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - const size_t buf_size = range_size * v_size_el; - write_tensor_data(kv.v_l[il], src_offset, buf_size); - } - } - } - } - } - - void write_kv_cache(const llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; - - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = kv.size; - for (uint32_t i = 0; i < kv.size; ++i) { - const auto & cell = kv.cells[i]; - if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { - ++cell_count; - if (cell_range_begin == kv.size) { - cell_range_begin = i; - } - } else { - if (cell_range_begin != kv.size) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv.size; - } - } - } - if (cell_range_begin != kv.size) { - cell_ranges.emplace_back(cell_range_begin, kv.size); - } - - // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - GGML_ASSERT(cell_count == cell_count_check); - - write(&cell_count, sizeof(cell_count)); - - write_kv_cache_meta(kv, cell_ranges, seq_id); - write_kv_cache_data(kv, hparams, cell_ranges); - } }; struct llama_data_read { @@ -1135,241 +998,6 @@ struct llama_data_read { read_to(ctx->embd, embeddings_size * sizeof(float)); } } - - bool read_kv_cache_meta(llama_kv_cache & kv, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { - if (dest_seq_id != -1) { - // single sequence - - kv.seq_rm(dest_seq_id, -1, -1); - - llama_sbatch sbatch; - llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - - batch.n_tokens = cell_count; - batch.n_seq_tokens = cell_count; - batch.n_seqs = 1; - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - uint32_t n_seq_id; - - read_to(&pos, sizeof(pos)); - read_to(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id != 0) { - LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); - return false; - } - - batch.pos[i] = pos; - } - batch.n_seq_id[0] = 1; - batch.seq_id[0] = &dest_seq_id; - if (!kv.find_slot(batch)) { - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return false; - } - - // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(kv.head + cell_count <= kv.size); - GGML_ASSERT(kv.cells[kv.head].pos == batch.pos[0]); - GGML_ASSERT(kv.cells[kv.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv.cells[kv.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv.cells[kv.head + cell_count - 1].has_seq_id(dest_seq_id)); - } else { - // whole KV cache restore - - if (cell_count > kv.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); - return false; - } - - 
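For reference, the reader code removed in this hunk (and reinstated as llama_kv_cache member functions in the next commit) ultimately backs the public llama_state_seq_* calls. A minimal sketch of that round trip, in the spirit of the save-load-state example; ctx_src, ctx_dst and the sequence ids are illustrative, and error handling is reduced to checking for a zero return:

#include <cstdint>
#include <vector>

#include "llama.h"

// Sketch only: copy the KV state of sequence 0 in ctx_src into sequence 1 of ctx_dst.
static bool copy_seq_state(llama_context * ctx_src, llama_context * ctx_dst) {
    std::vector<uint8_t> buf(llama_state_seq_get_size(ctx_src, 0));

    const size_t n_saved = llama_state_seq_get_data(ctx_src, buf.data(), buf.size(), 0);
    if (n_saved == 0) {
        return false; // nothing was serialized
    }

    // a return of 0 means the restore failed and the destination sequence was cleared
    return llama_state_seq_set_data(ctx_dst, buf.data(), n_saved, 1) != 0;
}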
kv.clear(); - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = kv.cells[i]; - - llama_pos pos; - uint32_t n_seq_id; - - read_to(&pos, sizeof(pos)); - read_to(&n_seq_id, sizeof(n_seq_id)); - - cell.pos = pos; - - for (uint32_t j = 0; j < n_seq_id; ++j) { - llama_seq_id seq_id; - read_to(&seq_id, sizeof(seq_id)); - - // TODO: llama_kv_cache should have a notion of max sequences - //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - if (seq_id < 0) { - //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); - return false; - } - - cell.seq_id.insert(seq_id); - - if (kv.recurrent) { - int32_t & tail = kv.cells[seq_id].tail; - if (tail != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); - return false; - } - tail = i; - } - } - } - - kv.head = 0; - kv.used = cell_count; - } - - if (kv.recurrent) { - for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = kv.head + i; - // make sure the recurrent states will keep their restored state - kv.cells[cell_id].src = cell_id; - } - } - - return true; - } - - bool read_kv_cache_data(llama_kv_cache & kv, const llama_hparams & hparams, uint32_t cell_count) { - uint32_t v_trans; - uint32_t n_layer; - read_to(&v_trans, sizeof(v_trans)); - read_to(&n_layer, sizeof(n_layer)); - - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); - return false; - } - if (cell_count > kv.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv.size); - return false; - } - if (kv.v_trans != (bool) v_trans) { - LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); - return false; - } - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t)kv.k_l[il]->type; - if (k_type_i != k_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return false; - } - - // Read row size of key - uint64_t k_size_row_ref; - read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv.k_l[il], read(cell_count * k_size_row), kv.head * k_size_row, cell_count * k_size_row); - } - } - - if (!kv.v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read row size of value - uint64_t v_size_row_ref; 
- read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_row), kv.head * v_size_row, cell_count * v_size_row); - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read element size of value - uint32_t v_size_el_ref; - read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(kv.v_l[il]->type); - if (v_size_el != v_size_el_ref) { - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); - return false; - } - - // Read GQA embedding size - uint32_t n_embd_v_gqa_ref; - read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); - if (n_embd_v_gqa != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); - return false; - } - - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv.head + j * kv.size) * v_size_el; - ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); - } - } - } - } - return true; - } - - void read_kv_cache(llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { - uint32_t cell_count; - read_to(&cell_count, sizeof(cell_count)); - - bool res = read_kv_cache_meta(kv, cell_count, seq_id) && read_kv_cache_data(kv, hparams, cell_count); - - if (!res) { - if (seq_id == -1) { - kv.clear(); - } else { - kv.seq_rm(seq_id, -1, -1); - } - throw std::runtime_error("failed to restore kv cache"); - } - } }; struct llama_data_write_dummy : llama_data_write { @@ -1518,7 +1146,18 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_logits(ctx); data_ctx.write_embeddings(ctx); - data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams); + llama_kv_cache::io io = { + /* .write =*/ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read =*/ nullptr, + /* .read_to =*/ nullptr, + }; + + ctx->kv_self.state_write(io, ctx->model.hparams); return data_ctx.get_size_written(); } @@ -1555,7 +1194,18 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_logits(ctx); data_ctx.read_embeddings(ctx); - data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams); + llama_kv_cache::io io = { + /* .write =*/ nullptr, + /* .write_tensor_data =*/ nullptr, + /* .read =*/ [&](size_t size) { + return 
data_ctx.read(size); + }, + /* .read_to =*/ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; + + ctx->kv_self.state_read(io, ctx->model.hparams); return data_ctx.get_size_read(); } @@ -1651,7 +1301,18 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams, seq_id); + llama_kv_cache::io io = { + /* .write =*/ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read =*/ nullptr, + /* .read_to =*/ nullptr, + }; + + ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); return data_ctx.get_size_written(); } @@ -1674,7 +1335,18 @@ size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams, dest_seq_id); + llama_kv_cache::io io = { + /* .write =*/ nullptr, + /* .write_tensor_data =*/ nullptr, + /* .read =*/ [&](size_t size) { + return data_ctx.read(size); + }, + /* .read_to =*/ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; + + ctx->kv_self.state_read(io, ctx->model.hparams, dest_seq_id); return data_ctx.get_size_read(); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 9f3b4e5144415..6886d24f0d98f 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -8,6 +8,7 @@ #include #include #include +#include static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; @@ -696,6 +697,383 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } +void llama_kv_cache::state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges, hparams); +} + +void llama_kv_cache::state_read(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, hparams, cell_count); + + if 
(!res) { + if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache::state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache::state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { + const uint32_t v_trans = this->v_trans ? 1 : 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor_data(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor_data(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + 
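The write path above only ever touches the io callbacks, so any byte sink can back it. A minimal sketch of wiring llama_kv_cache::io to an in-memory buffer, with the callback signatures taken from the lambdas in llama_state_get_data_internal; make_buffer_writer is an illustrative helper, and ggml_backend_tensor_get is just one possible way to realize write_tensor_data:

#include <cstdint>
#include <vector>

#include "llama-kv-cache.h" // llama_kv_cache::io (introduced in this patch)
#include "ggml-backend.h"   // ggml_backend_tensor_get

// Sketch only: an io writer that appends all serialized bytes to a std::vector.
// The caller must keep `out` alive for as long as the returned callbacks are used.
static llama_kv_cache::io make_buffer_writer(std::vector<uint8_t> & out) {
    return {
        /* .write             =*/ [&out](const void * src, size_t size) {
            const uint8_t * p = static_cast<const uint8_t *>(src);
            out.insert(out.end(), p, p + size);
        },
        /* .write_tensor_data =*/ [&out](const struct ggml_tensor * tensor, size_t offset, size_t size) {
            std::vector<uint8_t> tmp(size);
            ggml_backend_tensor_get(tensor, tmp.data(), offset, size); // copy tensor bytes to host
            out.insert(out.end(), tmp.begin(), tmp.end());
        },
        /* .read              =*/ nullptr,
        /* .read_to           =*/ nullptr,
    };
}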
io.write_tensor_data(v_l[il], src_offset, buf_size); + } + } + } + } +} + +bool llama_kv_cache::state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + if (recurrent) { + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + } + + head = 0; + used = cell_count; + } + + if (recurrent) { + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + } + + return true; +} + +bool llama_kv_cache::state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row 
is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, 
cell_count * v_size_el); + } + } + } + } + + return true; +} + +///////////// + void llama_kv_cache_clear(llama_kv_cache * kv) { kv->clear(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 7fc2fabf5163d..0384a2b7ce7ab 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -6,8 +6,10 @@ #include #include +#include struct llama_cparams; +struct llama_hparams; struct llama_ubatch; struct llama_kv_cell { @@ -45,6 +47,7 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences +// TODO: add llama_hparams & struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; @@ -111,12 +114,29 @@ struct llama_kv_cache { size_t size_k_bytes() const; size_t size_v_bytes() const; + struct io { + std::function write; + std::function write_tensor_data; + + std::function read; + std::function read_to; + }; + + void state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; + void state_read (const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + private: ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; std::vector ctxs; std::vector bufs; + + void state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + + bool state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count); }; // From 17b363afd3575f8f9d025a35d2abb75f528a64c2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 16:47:34 +0200 Subject: [PATCH 08/84] llama : update llama_kv_self API ggml-ci --- common/common.cpp | 6 +- common/speculative.cpp | 10 +- examples/batched-bench/batched-bench.cpp | 6 +- examples/batched.swift/Sources/main.swift | 2 +- .../cvector-generator/cvector-generator.cpp | 3 +- examples/embedding/embedding.cpp | 5 +- examples/gritlm/gritlm.cpp | 8 +- examples/imatrix/imatrix.cpp | 4 +- examples/infill/infill.cpp | 6 +- examples/llama-bench/llama-bench.cpp | 6 +- .../llama/src/main/cpp/llama-android.cpp | 8 +- .../llama.cpp.swift/LibLlama.swift | 8 +- examples/lookahead/lookahead.cpp | 13 +- examples/lookup/lookup.cpp | 3 +- examples/main/main.cpp | 14 +- examples/parallel/parallel.cpp | 11 +- examples/passkey/passkey.cpp | 30 ++-- examples/perplexity/perplexity.cpp | 24 +-- examples/retrieval/retrieval.cpp | 4 +- examples/run/run.cpp | 7 +- examples/save-load-state/save-load-state.cpp | 4 +- examples/server/server.cpp | 25 ++- examples/simple-chat/simple-chat.cpp | 6 +- .../speculative-simple/speculative-simple.cpp | 4 +- examples/speculative/speculative.cpp | 29 ++-- include/llama.h | 105 ++++++++++--- src/llama-context.cpp | 34 ++-- src/llama-kv-cache.cpp | 20 +-- src/llama-kv-cache.h | 42 +++++ src/llama.cpp | 145 +++++++++++++++++- 30 files changed, 387 insertions(+), 205 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 29de45189e2d3..098feebee9e65 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -952,9 +952,7 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } - llama_kv_cache * kv = llama_get_kv_cache(lctx); - - if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) { + if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) { LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV 
cache shifting\n", __func__); params.ctx_shift = false; } @@ -1059,7 +1057,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_decoder(model)) { llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(kv); + llama_kv_self_clear(lctx); llama_synchronize(lctx); llama_perf_context_reset(lctx); } diff --git a/common/speculative.cpp b/common/speculative.cpp index 6ac0585178ebd..a660f198ae865 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -171,10 +171,8 @@ llama_tokens common_speculative_gen_draft( llama_tokens result; result.reserve(params.n_draft); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - if (reuse_n == 0) { - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); prompt.clear(); } else { @@ -193,14 +191,14 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_kv_cache_seq_rm (kv, 0, 0, reuse_i); - llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i); + llama_kv_self_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i); prompt.erase(prompt.begin(), prompt.begin() + reuse_i); } if (reuse_n < (int) prompt.size()) { - llama_kv_cache_seq_rm (kv, 0, reuse_n, -1); + llama_kv_self_seq_rm (ctx, 0, reuse_n, -1); prompt.erase(prompt.begin() + reuse_n, prompt.end()); } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index fcbad37bb3f2f..430e8be512653 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -57,8 +57,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const int32_t n_kv_max = llama_n_ctx(ctx); llama_batch batch = llama_batch_init(n_kv_max, 0, 1); @@ -134,7 +132,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -143,7 +141,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 371917b2ee863..a6494ebdfe176 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -111,7 +111,7 @@ if llama_decode(context, batch) != 0 { } for i in 1 ..< n_parallel { - llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index adb4a60ada41f..3733e32d7007e 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,8 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index fda0949f1c4cf..c4fb1c6d1d78f 
100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -34,11 +34,10 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const llama_model * model = llama_get_model(ctx); - llama_kv_cache * kv = llama_get_kv_cache(ctx); + const struct llama_model * model = llama_get_model(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 16437453edb89..f7db7861c1ad5 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -13,8 +13,6 @@ static std::vector> encode(llama_context * ctx, const std::ve const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { @@ -47,7 +45,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -102,11 +100,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 5efe4f019f562..e335ecc74b8fe 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -431,8 +431,6 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); @@ -499,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index de8e7769552bb..4e2f7b7270003 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -139,8 +139,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_model_n_ctx_train(model); @@ -334,8 +332,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(kv, 0, params.n_keep + 1 + 
n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8843c0048d6cc..fc58135fe5fa8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1575,11 +1575,9 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - test t(inst, lmodel, ctx); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // cool off before the test if (params.delay) { @@ -1619,7 +1617,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); uint64_t t_start = get_time_ns(); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2a73983a9832f..cf5e14907247e 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( } batch->logits[batch->n_tokens - 1] = true; - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp_start = ggml_time_us(); if (llama_decode(context, *batch) != 0) { @@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( LOGi("Benchmark text generation (tg)"); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { @@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_end = ggml_time_us(); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; @@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { - llama_kv_cache_clear(reinterpret_cast(context)); + llama_kv_self_clear(reinterpret_cast(context)); } diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 477c3e6f2e95b..82c26935bbaea 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -208,7 +208,7 @@ actor LlamaContext { } batch.logits[Int(batch.n_tokens) - 1] = 1 // true - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -221,7 +221,7 @@ actor LlamaContext { // bench text generation - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -240,7 +240,7 @@ actor LlamaContext { let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000; - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0 let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0 @@ -290,7 +290,7 @@ actor LlamaContext { func clear() { tokens_list.removeAll() temporary_invalid_cchars.removeAll() - llama_kv_cache_clear(context) + llama_kv_self_clear(context) } private func tokenize(text: String, add_bos: Bool) -> [llama_token] { diff --git a/examples/lookahead/lookahead.cpp 
b/examples/lookahead/lookahead.cpp index 1219c207464d2..b7f334007a39b 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -60,7 +60,6 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -96,7 +95,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(kv, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -438,17 +437,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(kv, -1, n_past, -1); + llama_kv_self_seq_rm(ctx, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(kv, seq_id_best); - llama_kv_cache_seq_cp (kv, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (kv, seq_id_best, -1, -1); + llama_kv_self_seq_keep(ctx, seq_id_best); + llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1); + llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(kv, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 8628f7318556c..4ae93b2a5ed15 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -35,7 +35,6 @@ int main(int argc, char ** argv){ llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -193,7 +192,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(kv, 0, n_past, -1); + llama_kv_self_seq_rm(ctx, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 9d79af79e2723..23437937cfb5e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,8 +157,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -330,7 +328,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(kv, -1, n_matching_session_tokens, -1); + llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -571,8 +569,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (kv, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + 
n_discard); + llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -595,9 +593,9 @@ int main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(kv, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(kv, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(kv, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 2ba0706dc5d24..3f9e1bcbbe540 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -134,7 +134,6 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -202,7 +201,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("\n"); @@ -234,9 +233,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(kv, i, -1, -1); + llama_kv_self_seq_rm(ctx, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -372,8 +371,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(kv, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(kv, 0, client.id + 1, -1, -1); + llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e2764313b2f01..46de2c2a207f9 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -86,8 +86,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -134,11 +132,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add(kv, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div(kv, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } common_batch_clear(batch); @@ -168,12 +166,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (kv); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; common_batch_clear(batch); @@ -199,12 +197,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (kv); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 6c9f716ede23c..31c436f13976b 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -299,8 +299,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -362,7 +360,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -452,8 +450,6 @@ static results_perplexity 
perplexity(llama_context * ctx, const common_params & const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -550,7 +546,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -745,8 +741,6 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -929,7 +923,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1090,8 +1084,6 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); @@ -1210,7 +1202,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1396,8 +1388,6 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); @@ -1584,7 +1574,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1681,8 +1671,6 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1776,7 +1764,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index a907ea07607dd..0efe20d4b3f5d 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -82,10 +82,8 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } 
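The call-site changes in these examples are mechanical: the llama_context pointer replaces the explicit llama_kv_cache pointer. A minimal sketch of the recurring context-shift step expressed with the renamed entry points, mirroring the main and infill changes above; shift_context is an illustrative name and sequence 0 is assumed:

// Sketch only: drop the oldest n_discard tokens after n_keep and slide the
// remaining cells back so the positions of sequence 0 stay contiguous.
static void shift_context(llama_context * ctx, int n_keep, int n_discard, int & n_past) {
    // remove positions [n_keep, n_keep + n_discard) from sequence 0
    llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
    // shift the surviving positions down by n_discard
    llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);

    n_past -= n_discard;
}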
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { - llama_kv_cache * kv = llama_get_kv_cache(ctx); - // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 8e2c174a955e8..2c38d1ef68321 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -756,8 +756,7 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const llama_kv_cache * kv = llama_get_kv_cache(llama_data.context.get()); - const bool is_first = llama_kv_cache_used_cells(kv) == 0; + const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -772,10 +771,8 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { - llama_kv_cache * kv = llama_get_kv_cache(ctx.get()); - const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_kv_cache_used_cells(kv); + const int n_ctx_used = llama_kv_self_used_cells(ctx.get()); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); printe("context size exceeded\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3839fbe8c84d5..77b1572a9dec5 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -156,8 +156,6 @@ int main(int argc, char ** argv) { // make new context llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); - llama_kv_cache * kv3 = llama_get_kv_cache(ctx3); - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); @@ -198,7 +196,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(kv3); + llama_kv_self_clear(ctx3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 076044d39679c..b665bde417094 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1693,7 +1693,6 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - llama_kv_cache * kv = nullptr; const llama_vocab * vocab = nullptr; @@ -1756,8 +1755,6 @@ struct server_context { return false; } - kv = llama_get_kv_cache(ctx); - vocab = llama_model_get_vocab(model); n_ctx = llama_n_ctx(ctx); @@ -2026,7 +2023,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); clean_kv_cache = false; } @@ -2568,8 +2565,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv); - 
res->kv_cache_used_cells = llama_kv_cache_used_cells(kv); + res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); + res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2685,7 +2682,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(kv, slot->id, -1, -1); + llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2753,8 +2750,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (kv, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -2941,8 +2938,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c); - llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift); + llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_self_seq_add(ctx, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2980,9 +2977,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) { + if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(kv, slot.id, -1, -1); + llama_kv_self_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3222,7 +3219,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1); + llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 130e326b55d4c..84f4159737260 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -88,8 +88,6 @@ int main(int argc, char ** argv) { return 1; } - const llama_kv_cache * kv = llama_get_kv_cache(ctx); - // initialize the sampler llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); @@ -100,7 +98,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_kv_cache_used_cells(kv) == 0; + const bool is_first = llama_kv_self_used_cells(ctx) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -115,7 +113,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_kv_cache_used_cells(kv); + int n_ctx_used = llama_kv_self_used_cells(ctx); if (n_ctx_used + batch.n_tokens 
> n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 24bdc806d5710..a5d2bc9d09de7 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -45,8 +45,6 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt.model.get(); ctx_tgt = llama_init_tgt.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx_tgt); - const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -219,7 +217,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_cache_seq_rm(kv, 0, n_past, -1); + llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b4e5259b5be46..bfddc67e034fb 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -90,9 +90,6 @@ int main(int argc, char ** argv) { model_dft = llama_init_dft.model.get(); ctx_dft = llama_init_dft.context.get(); - llama_kv_cache * kv_tgt = llama_get_kv_cache(ctx_tgt); - llama_kv_cache * kv_dft = llama_get_kv_cache(ctx_dft); - const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -423,14 +420,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_cache_seq_keep(kv_dft, s_keep); - llama_kv_cache_seq_cp (kv_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(kv_dft, 0); + llama_kv_self_seq_keep(ctx_dft, s_keep); + llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1); + llama_kv_self_seq_keep(ctx_dft, 0); - llama_kv_cache_seq_rm (kv_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(kv_tgt, s_keep); - llama_kv_cache_seq_cp (kv_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(kv_tgt, 0); + llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); + llama_kv_self_seq_keep(ctx_tgt, s_keep); + llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1); + llama_kv_self_seq_keep(ctx_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -447,8 +444,8 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(kv_dft, 0, n_past_dft, -1); - // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(kv_dft, batch_dft).c_str()); + llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -506,8 +503,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(kv_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(kv_dft, s, n_seq_cur, -1, -1); + llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1); + llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -588,9 +585,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(kv_tgt, 0); + llama_kv_self_seq_keep(ctx_tgt, 0); for (int s = 1; s 
< n_seq_dft; ++s) { - llama_kv_cache_seq_cp(kv_tgt, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); diff --git a/include/llama.h b/include/llama.h index 08b8658ad89ac..91300b1ae51a3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -469,7 +469,7 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); // TODO: remove const? - LLAMA_API struct llama_kv_cache * llama_get_kv_cache( struct llama_context * ctx); + LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @@ -641,28 +641,28 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv); + LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), - "use llama_kv_cache_n_tokens instead"); + "use llama_kv_self_n_tokens instead"); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv); + LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx), - "use llama_kv_cache_used_cells instead"); + "use llama_kv_self_used_cells instead"); // Clear the KV cache - both cell info is erased and KV data is zeroed - LLAMA_API void llama_kv_cache_clear( - struct llama_kv_cache * kv); + LLAMA_API void llama_kv_self_clear( + struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API bool llama_kv_cache_seq_rm( - struct llama_kv_cache * kv, + LLAMA_API bool llama_kv_self_seq_rm( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); @@ -671,26 +671,26 @@ extern "C" { // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_cp( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_cp( + struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // Removes all tokens that do not belong to the specified sequence - LLAMA_API void llama_kv_cache_seq_keep( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_keep( + struct llama_context * ctx, llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() + // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_add( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_add( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, @@ -699,32 +699,87 @@ extern "C" { // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() + // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_div( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_div( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); // Returns the largest position present in the KV cache for the specified sequence - LLAMA_API llama_pos llama_kv_cache_seq_pos_max( - struct llama_kv_cache * kv, + LLAMA_API llama_pos llama_kv_self_seq_pos_max( + struct llama_context * ctx, llama_seq_id seq_id); // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() - LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv); + // - explicitly with llama_kv_self_update() + LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx); // Check if the context supports KV cache shifting - LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv); + LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx); // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
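    //
    // Typical usage (illustrative sketch; `ctx`, `n_keep`, `n_discard` and `n_past`
    // are assumed to be provided by the caller): drop `n_discard` tokens of sequence 0
    // after the first `n_keep` tokens, shift the remaining positions back, and then
    // apply the resulting K-shift:
    //
    //   llama_kv_self_seq_rm (ctx, 0, n_keep,             n_keep + n_discard);
    //   llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);
    //
    //   // the shift is applied lazily on the next llama_decode(), or explicitly:
    //   llama_kv_self_update(ctx);
    //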
- LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv); + LLAMA_API void llama_kv_self_update(struct llama_context * ctx); + + DEPRECATED(LLAMA_API void llama_kv_cache_clear( + struct llama_context * ctx), + "use llama_kv_self_clear instead"); + + DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1), + "use llama_kv_self_seq_rm instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp( + struct llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1), + "use llama_kv_self_seq_cp instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep( + struct llama_context * ctx, + llama_seq_id seq_id), + "use llama_kv_self_seq_keep instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_add( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta), + "use llama_kv_self_seq_add instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_div( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d), + "use llama_kv_self_seq_div instead"); + + DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max( + struct llama_context * ctx, + llama_seq_id seq_id), + "use llama_kv_self_seq_pos_max instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx), + "use llama_kv_self_defrag instead"); + + DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx), + "use llama_kv_self_can_shift instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx), + "use llama_kv_self_update instead"); + // // State / sessions diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0e146652c5996..0004e214b9e27 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -606,7 +606,7 @@ const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->model; } -llama_kv_cache * llama_get_kv_cache(llama_context * ctx) { +llama_kv_cache * llama_get_kv_self(llama_context * ctx) { return &ctx->kv_self; } @@ -1147,14 +1147,14 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_embeddings(ctx); llama_kv_cache::io io = { - /* .write =*/ [&](const void * src, size_t size) { + /* .write = */ [&](const void * src, size_t size) { data_ctx.write(src, size); }, - /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { data_ctx.write_tensor_data(tensor, offset, size); }, - /* .read =*/ nullptr, - /* .read_to =*/ nullptr, + /* .read = */ nullptr, + /* .read_to = */ nullptr, }; ctx->kv_self.state_write(io, ctx->model.hparams); @@ -1195,12 +1195,12 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_embeddings(ctx); llama_kv_cache::io io = { - /* .write =*/ nullptr, - /* .write_tensor_data =*/ nullptr, - /* .read =*/ [&](size_t size) { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { return data_ctx.read(size); }, - /* .read_to =*/ [&](void * dst, size_t size) { + /* .read_to = */ [&](void * dst, size_t size) { data_ctx.read_to(dst, size); }, }; @@ -1302,14 +1302,14 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam llama_synchronize(ctx); llama_kv_cache::io io = 
{ - /* .write =*/ [&](const void * src, size_t size) { + /* .write = */ [&](const void * src, size_t size) { data_ctx.write(src, size); }, - /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { data_ctx.write_tensor_data(tensor, offset, size); }, - /* .read =*/ nullptr, - /* .read_to =*/ nullptr, + /* .read = */ nullptr, + /* .read_to = */ nullptr, }; ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); @@ -1336,12 +1336,12 @@ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llam llama_synchronize(ctx); llama_kv_cache::io io = { - /* .write =*/ nullptr, - /* .write_tensor_data =*/ nullptr, - /* .read =*/ [&](size_t size) { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { return data_ctx.read(size); }, - /* .read_to =*/ [&](void * dst, size_t size) { + /* .read_to = */ [&](void * dst, size_t size) { data_ctx.read_to(dst, size); }, }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 6886d24f0d98f..d2b81a0220d83 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1072,7 +1072,17 @@ bool llama_kv_cache::state_read_data(const io & io, const llama_hparams & hparam return true; } -///////////// +// +// interface implementation +// + +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + return kv->n_tokens(); +} + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + return kv->used; +} void llama_kv_cache_clear(llama_kv_cache * kv) { kv->clear(); @@ -1125,14 +1135,6 @@ void llama_kv_cache_defrag(llama_kv_cache * kv) { kv->defrag(); } -int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { - return kv->n_tokens(); -} - -int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { - return kv->used; -} - bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { return kv->can_shift; } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 0384a2b7ce7ab..2e021d4edf959 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -190,6 +190,48 @@ struct llama_kv_slot_restorer { } }; +// TODO: maybe become part of the public llama_kv_cache in the future +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv); + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv); + +void llama_kv_cache_clear(llama_kv_cache * kv); + +bool llama_kv_cache_seq_rm( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); + +void llama_kv_cache_seq_cp( + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); + +void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id); + +void llama_kv_cache_seq_add( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); + +void llama_kv_cache_seq_div( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); + +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id); + +void llama_kv_cache_defrag(llama_kv_cache * kv); + +bool llama_kv_cache_can_shift(const llama_kv_cache * kv); + // // kv cache view // diff --git a/src/llama.cpp b/src/llama.cpp index 0227ba6b36a93..b8f4043757d49 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8564,7 +8564,7 @@ static int llama_decode_impl( // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_update_kv_cache(&lctx, &lctx.kv_self); // TODO: 
lctx->update_kv_cache() + llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -9182,9 +9182,12 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_update_kv_cache_impl(llama_context & lctx, llama_kv_cache & kv) { +// TODO: move to llama_context +static void llama_kv_self_update_impl(llama_context & lctx) { bool need_reserve = false; + auto & kv = lctx.kv_self; + if (kv.has_shift) { if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); @@ -9856,17 +9859,151 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * // deprecated int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} + +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { return llama_kv_cache_n_tokens(&ctx->kv_self); } // deprecated int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} + +int32_t llama_kv_self_used_cells(const llama_context * ctx) { return llama_kv_cache_used_cells(&ctx->kv_self); } +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} + +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} + +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} + +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); +} + +void llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); +} + +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +} + +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); +} + +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} + 
+llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} + +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); +} + +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} + // TODO: move to llama-context -void llama_update_kv_cache(llama_context * ctx, llama_kv_cache * kv) { - llama_update_kv_cache_impl(*ctx, *kv); +void llama_kv_self_update(llama_context * ctx) { + llama_kv_self_update_impl(*ctx); } /// From a19f671fe078497f73ec1898951475e026ffdc20 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 10:54:21 +0200 Subject: [PATCH 09/84] context : minor ggml-ci --- src/llama-context.cpp | 36 +++++++++++------------------------- src/llama-context.h | 8 +++----- src/llama-kv-cache.cpp | 1 + src/llama-kv-cache.h | 6 +++--- src/llama.cpp | 33 +++++++++++++++++++-------------- 5 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0004e214b9e27..9eae6fe57ce1e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,30 +8,6 @@ #include #include -void llama_set_k_shift(struct llama_context & lctx) { - const int64_t kv_size = lctx.kv_self.size; - - assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < kv_size; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } -} - -void llama_set_s_copy(struct llama_context & lctx) { - const int64_t kv_size = lctx.kv_self.size; - - assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - - int32_t * data = (int32_t *) lctx.inp_s_copy->data; - - for (int i = 0; i < kv_size; ++i) { - data[i] = lctx.kv_self.cells[i].src; - } -} - // llama input static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { @@ -58,6 +34,16 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +void llama_context::set_k_shift(llama_kv_cache & kv) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv.size; ++i) { + data[i] = kv.cells[i].delta; + } +} + void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { // // set input data @@ -134,7 +120,6 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - float * data = nullptr; float * data_swa = nullptr; @@ -599,6 +584,7 @@ uint32_t llama_n_ubatch(const struct llama_context * ctx) { } uint32_t llama_n_seq_max(const struct llama_context * ctx) { + // TODO: add notion of n_seq_max to llama_kv_cache and use it here return ctx->kv_self.size; } diff --git a/src/llama-context.h b/src/llama-context.h index a9268b2920908..73baa711f394a 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -18,7 +18,7 @@ struct llama_context { llama_context(const llama_model & model) : model(model) , t_start_us(model.t_start_us) - , 
t_load_us(model.t_load_us) {} + , t_load_us (model.t_load_us) {} const struct llama_model & model; @@ -107,13 +107,11 @@ struct llama_context { struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + + void set_k_shift(llama_kv_cache & kv); }; // TODO: make these methods of llama_context -void llama_set_k_shift(struct llama_context & lctx); - -void llama_set_s_copy(struct llama_context & lctx); - void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); // Make sure enough space is available for outputs. diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index d2b81a0220d83..b79c2ff934a6e 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -6,6 +6,7 @@ #include "llama-model.h" #include +#include #include #include #include diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 2e021d4edf959..5ffee62818b18 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -88,11 +88,11 @@ struct llama_kv_cache { void clear(); - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); void seq_keep(llama_seq_id seq_id); - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); llama_pos seq_pos_max(llama_seq_id seq_id); diff --git a/src/llama.cpp b/src/llama.cpp index b8f4043757d49..3e1cd8260b329 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1142,18 +1142,18 @@ struct llm_build_context { ctx0 = ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; + lctx.inp_tokens = nullptr; + lctx.inp_embd = nullptr; + lctx.inp_pos = nullptr; + lctx.inp_out_ids = nullptr; + lctx.inp_KQ_mask = nullptr; + lctx.inp_KQ_mask_swa = nullptr; + lctx.inp_K_shift = nullptr; + lctx.inp_mean = nullptr; + lctx.inp_cls = nullptr; + lctx.inp_s_copy = nullptr; + lctx.inp_s_mask = nullptr; + lctx.inp_s_seq = nullptr; lctx.inp_pos_bucket = nullptr; lctx.inp_embd_enc = nullptr; lctx.inp_KQ_mask_cross = nullptr; @@ -1174,9 +1174,11 @@ struct llm_build_context { ggml_set_input(lctx.inp_K_shift); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head_k, n_head_kv, n_ctx, @@ -1189,6 +1191,7 @@ struct llm_build_context { // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); cb(tmp, "K_f32", il); + for (auto & backend : lctx.backends) { // Figure out which backend KV cache belongs to if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { @@ -1200,6 +1203,7 @@ struct 
llm_build_context { lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted_f32", il); + tmp = ggml_cpy(ctx0, tmp, k); } else { // we rotate only the first n_rot dimensions @@ -1208,6 +1212,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); } cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); } @@ -9201,7 +9206,7 @@ static void llama_kv_self_update_impl(llama_context & lctx) { ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - llama_set_k_shift(lctx); + lctx.set_k_shift(kv); llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); From ae274f9747cce6ba6b4910d05ddc3016cd0b4e21 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 13:35:56 +0200 Subject: [PATCH 10/84] llama : fix names [no ci] --- src/llama.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3e1cd8260b329..37816ddc28a38 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1373,9 +1373,9 @@ struct llm_build_context { inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; - } else { - inp = nullptr; } + + inp = nullptr; } GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); @@ -1431,7 +1431,7 @@ struct llm_build_context { return gf; } - struct ggml_tensor * llm_build_pos_bucket(bool causal) { + struct ggml_tensor * build_pos_bucket(bool causal) { if (causal) { lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); } else { @@ -1444,7 +1444,7 @@ struct llm_build_context { return lctx.inp_pos_bucket; } - struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); cb(pos_bucket_1d, "pos_bucket_1d", -1); @@ -1463,7 +1463,7 @@ struct llm_build_context { return pos_bias; } - struct ggml_tensor * llm_build_inp_embd_enc() { + struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); ggml_set_input(lctx.inp_embd_enc); @@ -1471,7 +1471,7 @@ struct llm_build_context { return lctx.inp_embd_enc; } - struct ggml_tensor * llm_build_inp_KQ_mask_cross() { + struct ggml_tensor * build_inp_KQ_mask_cross() { lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); ggml_set_input(lctx.inp_KQ_mask_cross); cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1); @@ -6775,7 +6775,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); + struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); @@ -6810,7 +6810,7 @@ struct llm_build_context { cb(kq, "kq", il); struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); + struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); cb(kq_b, "kq_b", il); @@ -6909,11 +6909,11 @@ struct llm_build_context { GGML_ASSERT(!lctx.is_encoding); GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); + struct ggml_tensor * embd_enc = build_inp_embd_enc(); + struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); + struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6961,7 +6961,7 @@ struct llm_build_context { cb(kq, "kq", il); struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); + struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); cb(kq_b, "kq_b", il); From f2524c0e4137a4327473c086f97a01aa0632ca3e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Jan 2025 15:04:14 +0200 Subject: [PATCH 11/84] llama : remove references to llama_kv_cache (wip) Intermediate step necessary to abstract the `llama_context` and `llama_kv_cache`. ggml-ci --- src/llama-context.cpp | 1031 ++++++++- src/llama-context.h | 162 +- src/llama.cpp | 4642 +++++++++++++++++------------------------ 3 files changed, 3017 insertions(+), 2818 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9eae6fe57ce1e..910e2243d7e8a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,8 +8,6 @@ #include #include -// llama input - static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -34,56 +32,88 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -void llama_context::set_k_shift(llama_kv_cache & kv) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); +// TODO: improve +void llama_context::reset() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; +} + +void llama_context::prepare_k_shift() { +} - int32_t * data = (int32_t *) inp_K_shift->data; +void llama_context::prepare_defrag() { +} - for (uint32_t i = 0; i < kv.size; ++i) { - data[i] = kv.cells[i].delta; - } +void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { } -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { +// llama input + +void llama_context::set_inputs(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + // // set input data // - const auto & 
hparams = lctx.model.hparams; - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; + if (inp_K_shift) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; + } + + // the K-shift graph requires just this input + return; + } if (ubatch.token) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); } if (ubatch.embd) { const int64_t n_embd = hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); } - if (ubatch.pos && lctx.inp_pos) { + if (ubatch.pos && inp_pos) { const int64_t n_tokens = ubatch.n_tokens; - auto n_pos = lctx.n_pos_per_token; - ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos)); + auto n_pos = n_pos_per_token; + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - if (!lctx.inp_out_ids) { - LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__); + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); } else { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); - int32_t * data = (int32_t *) lctx.inp_out_ids->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; - if (lctx.n_outputs == n_tokens) { + if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { data[i] = i; } @@ -95,26 +125,26 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(lctx.n_outputs == n_outputs); - } else if (lctx.n_outputs == 1) { + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { // only keep last output data[0] = n_tokens - 1; } else { - GGML_ASSERT(lctx.n_outputs == 0); + GGML_ASSERT(n_outputs == 0); } } } GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); - if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) { + if (inp_KQ_mask || inp_KQ_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
- if (cparams.causal_attn && !lctx.is_encoding) { + if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -123,14 +153,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (lctx.inp_KQ_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - data = (float *) lctx.inp_KQ_mask->data; + if (inp_KQ_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); + data = (float *) inp_KQ_mask->data; } - if (lctx.inp_KQ_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer)); - data_swa = (float *) lctx.inp_KQ_mask_swa->data; + if (inp_KQ_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_swa->buffer)); + data_swa = (float *) inp_KQ_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -191,11 +221,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; + float * data = (float *) inp_KQ_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -238,11 +268,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - float * data = (float *) lctx.inp_mean->data; - memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); std::vector sum(n_tokens, 0); @@ -279,11 +309,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; @@ -306,11 +336,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * 
ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); @@ -341,17 +371,18 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; - if (lctx.inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); - float * data = (float *) lctx.inp_s_mask->data; + if (inp_s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); + float * data = (float *) inp_s_mask->data; // clear unused states for (int i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); + // TODO: do not mutate the KV cache // only clear once if (kv_cell.src < 0) { kv_cell.src = cell_id; @@ -359,14 +390,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - int32_t * data = (int32_t *) lctx.inp_s_copy->data; + if (inp_s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); + int32_t * data = (int32_t *) inp_s_copy->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; // prevent out-of-bound sources if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { @@ -375,6 +406,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { data[i] = kv_cell.src; + // TODO: do not mutate the KV cache // ensure copy only happens once if (kv_cell.src != (int32_t) cell_id) { kv_cell.src = cell_id; @@ -383,20 +415,20 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_pos_bucket) { + if (inp_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - int32_t * data = (int32_t *) lctx.inp_pos_bucket->data; + int32_t * data = (int32_t *) inp_pos_bucket->data; - if (!lctx.is_encoding) { + if (!is_encoding) { const int64_t n_kv = kv_self.n; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } @@ -404,28 +436,28 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } } } - if 
(!lctx.is_encoding && lctx.inp_embd_enc) { - assert(lctx.inp_embd_enc->type == GGML_TYPE_F32); - assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size()); + if (!is_encoding && inp_embd_enc) { + assert(inp_embd_enc->type == GGML_TYPE_F32); + assert((size_t) ggml_nelements(inp_embd_enc) == embd_enc.size()); - ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc)); + ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); } - if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) { - const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd; + if (!is_encoding && inp_KQ_mask_cross) { + const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_cross->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - float * data = (float *) lctx.inp_KQ_mask_cross->data; + float * data = (float *) inp_KQ_mask_cross->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -433,7 +465,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float f = -INFINITY; for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch.seq_id[j][s]; - if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) { + if (seq_ids_enc[i].find(seq_id) != seq_ids_enc[i].end()) { f = 0.0f; } } @@ -450,6 +482,851 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } +// do mat_mul, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +// do mat_mul_id, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur, + ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_KQ_mask = causal + ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask, "KQ_mask", -1); + ggml_set_input(inp_KQ_mask); + + inp_KQ_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask, GGML_TYPE_F16) : inp_KQ_mask; + + if (swa) { + GGML_ASSERT(hparams.n_swa > 0); + + inp_KQ_mask_swa = causal + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp_KQ_mask_swa); + + inp_KQ_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask_swa, GGML_TYPE_F16) : inp_KQ_mask_swa; + } +} + +void llama_context::build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + GGML_ASSERT(kv_self.size == n_ctx); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + //cb(k_cache_view, "k_cache_view", il); + + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(graph, ggml_cpy(ctx0, k_cur, k_cache_view)); + + assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + + struct ggml_tensor * v_cache_view = nullptr; + + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self.v_l[il]), + (kv_head)*ggml_element_size(kv_self.v_l[il])); + + v_cur = ggml_transpose(ctx0, v_cur); + } + //cb(v_cache_view, "v_cache_view", il); + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); +} + +ggml_tensor * llama_context::build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; + + // TODO: improve + bool is_sliding = false; + + switch (model.arch) { + case LLM_ARCH_COHERE2: + { + const int32_t sliding_window_pattern = 4; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_GEMMA2: + { + const int32_t sliding_window_pattern = 2; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_PHI3: + { + is_sliding = hparams.n_swa > 0; + } break; + default: + { + is_sliding = false; + } + }; + + const auto & kq_mask = is_sliding ? inp_KQ_mask_swa_cnv : inp_KQ_mask_cnv; + + const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + //cb(k, "k", il); + + struct ggml_tensor * cur; + + if (cparams.flash_attn) { + GGML_UNUSED(model); + GGML_UNUSED(n_ctx); + + // split cached v into n_head heads (not transposed) + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + 0); + //cb(v, "v", il); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); + + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); + } else { + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx0, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx0, kq); + kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); + + GGML_ASSERT(kv_self.size == n_ctx); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + //cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + //cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); + //cb(cur, "kqv_merged_cont", il); + + if (!cparams.offload_kqv) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + ggml_build_forward_expand(graph, cur); + + if (wo) { + cur = build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +ggml_tensor * llama_context::build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) { + const auto & hparams = model.hparams; + + return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); +} + +ggml_tensor * llama_context::get_rope_factors(int il) { + const auto & hparams = model.hparams; + + // choose 
long/short freq factors based on the context size + const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { + return model.layers[il].rope_long; + } + + return model.layers[il].rope_short; +} + +void llama_context::build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & n_ctx = cparams.n_ctx; + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & hparams = model.hparams; + + const auto & n_rot = hparams.n_rot; + const auto & n_layer = hparams.n_layer; + const auto & rope_type = hparams.rope_type; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + GGML_ASSERT(kv_self.size == n_ctx); + + inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + //cb(inp_K_shift, "K_shift", -1); + ggml_set_input(inp_K_shift); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = get_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0); + + struct ggml_tensor * tmp; + if (ggml_is_quantized(k->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); + //cb(tmp, "K_f32", il); + + for (auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + tmp = ggml_rope_ext_inplace(ctx0, tmp, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + //cb(tmp, "K_shifted_f32", il); + + tmp = ggml_cpy(ctx0, tmp, k); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, k, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + //cb(tmp, "K_shifted", il); + + ggml_build_forward_expand(graph, tmp); + } +} + +void llama_context::build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & hparams = model.hparams; + + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = kv_self.cell_max(); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = model.max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (model.max_nodes() - 
2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? + bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, 
nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + } + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes); +#endif +} + +ggml_tensor * llama_context::build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp_s_copy, "inp_s_copy", -1); + ggml_set_input(inp_s_copy); + return inp_s_copy; +} + +ggml_tensor * llama_context::build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp_s_mask, "inp_s_mask", -1); + ggml_set_input(inp_s_mask); + return inp_s_mask; +} + +ggml_tensor * llama_context::build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); + + // copy states + // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv + // this shrinks the tensors's ne[1] to n_kv + states = ggml_get_rows(ctx0, states, state_copy); + + // clear states of sequences which are starting at the beginning of this batch + // FIXME: zero-out NANs? + states = ggml_mul(ctx0, states, state_mask); + + // copy states which won't be changed further (between n_seqs and n_kv) + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)), + ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); + + // the part of the states that will be used and modified + return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0); +} + +// TODO: split +ggml_tensor * llama_context::build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_tokens = ubatch.n_tokens; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + const int64_t n_seqs = ubatch.n_seqs; + // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) + const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; + // Use the same RMS norm as the final layer norm + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + struct ggml_tensor * conv_states_all = kv_self.k_l[il]; + struct ggml_tensor * ssm_states_all = kv_self.v_l[il]; + + // (ab)using the KV cache to store the states + struct ggml_tensor * conv = build_copy_mask_state( + ctx0, graph, conv_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); + struct ggml_tensor * ssm = build_copy_mask_state( + ctx0, graph, ssm_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); + struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); + + // copy last (d_conv - 1) columns back into the state cache + struct ggml_tensor * last_conv = 
ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. + x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); + + x = ggml_silu(ctx0, x); + } + + // ssm + { + // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); + struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); + + // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms) { + dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); + B = ggml_rms_norm(ctx0, B, norm_rms_eps); + C = ggml_rms_norm(ctx0, C, norm_rms_eps); + } + + // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. 
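+        // A conceptual sketch of what the scan computes per token t (assuming the
+        // standard Mamba discretization; the exact update is implemented inside ggml_ssm_scan):
+        //   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
+        //   y_t = C_t * h_t
+        // The D skip connection and the SiLU(z) gating are applied to y below.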
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); + + // store last states + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); + + struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); + + // TODO: skip computing output earlier for unused tokens + + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + //cb(cur, "mamba_out", il); + + return cur; +} + + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 73baa711f394a..a2f41b5c8fc7d 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -14,6 +14,8 @@ #include #include +using llama_loras = std::unordered_map; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -22,12 +24,10 @@ struct llama_context { const struct llama_model & model; - struct llama_cparams cparams; - struct llama_sbatch sbatch; // TODO: revisit if needed - struct llama_kv_cache kv_self; - struct llama_adapter_cvec cvec; - - std::unordered_map lora; + llama_cparams cparams; + llama_sbatch sbatch; // TODO: revisit if needed + llama_adapter_cvec cvec; + llama_loras loras; std::vector backends; std::vector> set_n_threads_fns; @@ -72,18 +72,6 @@ struct llama_context { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // TODO: find a better way to accommodate mutli-dimension position encoding methods - // number of position id each token get, 1 for each token in most cases. - // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. 
- int n_pos_per_token = 1; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - // memory buffers used to evaluate the model std::vector buf_compute_meta; ggml_backend_sched_ptr sched; @@ -91,28 +79,144 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + void reset(); + + void prepare_k_shift(); + void prepare_defrag(); + void prepare_decode(const llama_ubatch & ubatch); + + void set_inputs(const llama_ubatch & ubatch); + + ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] + + // === encoder-decoder === + + // whether we are computing encoder output or decoder output + bool is_encoding = false; + + // output of the encoder part of the encoder-decoder models + std::vector embd_enc; + std::vector> seq_ids_enc; + + struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] + struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] + + // === unified KV cache === + + llama_kv_cache kv_self; + + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] + + void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case); + + void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case); + + ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case); + + ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale); + + ggml_tensor * get_rope_factors(int il); + + void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph); + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph); + + // === recurrent === + + // TODO: add recurrent cache + // TODO: add mamba-specific llama_context + + // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl + ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * 
state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case); + + ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] - void set_k_shift(llama_kv_cache & kv); -}; + // === vision === -// TODO: make these methods of llama_context -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); + // TODO: find a better way to accommodate mutli-dimension position encoding methods + // number of position id each token get, 1 for each token in most cases. + // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. + int n_pos_per_token = 1; +}; // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. diff --git a/src/llama.cpp b/src/llama.cpp index 37816ddc28a38..a2e5e0bea0fb5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4,8 +4,6 @@ #include "llama-mmap.h" #include "llama-context.h" #include "llama-vocab.h" -#include "llama-sampling.h" -#include "llama-kv-cache.h" #include "llama-model-loader.h" #include "llama-model.h" @@ -106,946 +104,15 @@ enum llm_norm_type { LLM_NORM_GROUP, }; -static struct ggml_tensor * llm_build_inp_embd( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_hparams & hparams, - const llama_ubatch & ubatch, - struct ggml_tensor * tok_embd, - const llm_build_cb & cb) { - const int64_t n_embd = hparams.n_embd; - - struct ggml_tensor * inpL; - - if (ubatch.token) { - lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens); - cb(lctx.inp_tokens, "inp_tokens", -1); - ggml_set_input(lctx.inp_tokens); - - inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); - - // apply lora for embedding tokens if needed - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat( - ctx, lw->b, // non-transposed lora_b - ggml_get_rows(ctx, lw->a, lctx.inp_tokens) - ), scale); - inpL = ggml_add(ctx, inpL, inpL_delta); - } - } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = lctx.inp_embd; - ggml_set_input(lctx.inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); - } - - cb(inpL, "inp_embd", -1); - - return inpL; -} - -static void llm_build_kv_store( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, - int32_t kv_head, - const llm_build_cb & cb, - int64_t il) { - const int64_t n_ctx = cparams.n_ctx; - - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = 
hparams.n_embd_v_gqa(il); - - GGML_ASSERT(kv.size == n_ctx); - - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head); - cb(k_cache_view, "k_cache_view", il); - - // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); - - struct ggml_tensor * v_cache_view = nullptr; - - if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head); - } else { - // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv.v_l[il]), - (kv_head)*ggml_element_size(kv.v_l[il])); - - v_cur = ggml_transpose(ctx, v_cur); - } - cb(v_cache_view, "v_cache_view", il); - - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); -} - -// do mat_mul, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, - struct ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -// do mat_mul_id, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm_id( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, // struct ggml_tensor * as - struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { - struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float alpha = it.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? 
it.second * alpha / rank : it.second; - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -static struct ggml_tensor * llm_build_norm( - struct ggml_context * ctx, - struct ggml_tensor * cur, - const llama_hparams & hparams, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type, - const llm_build_cb & cb, - int il) { - switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; - case LLM_NORM_GROUP: - { - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); - } break; - } - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - - if (mb) { - cur = ggml_add(ctx, cur, mb); - } - - return cur; -} - -static struct ggml_tensor * llm_build_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * up, - struct ggml_tensor * up_b, - struct ggml_tensor * up_s, - struct ggml_tensor * gate, - struct ggml_tensor * gate_b, - struct ggml_tensor * gate_s, - struct ggml_tensor * down, - struct ggml_tensor * down_b, - struct ggml_tensor * down_s, - struct ggml_tensor * act_scales, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, - const llm_build_cb & cb, - int il) { - struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (up_s) { - tmp = ggml_mul(ctx, tmp, up_s); - cb(tmp, "ffn_up_s", il); - } - - if (gate) { - switch (type_gate) { - case LLM_FFN_SEQ: - { - cur = llm_build_lora_mm(lctx, ctx, gate, tmp); - cb(cur, "ffn_gate", il); - } break; - case LLM_FFN_PAR: - { - cur = llm_build_lora_mm(lctx, ctx, gate, cur); - cb(cur, "ffn_gate", il); - } break; - } - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - - if (gate_s) { - cur = ggml_mul(ctx, cur, gate_s); - cb(cur, "ffn_gate_s", il); - } - - } else { - cur = tmp; - } - - switch (type_op) { - case LLM_FFN_SILU: - { - cur = ggml_silu(ctx, cur); - cb(cur, "ffn_silu", il); - } break; - case LLM_FFN_GELU: - { - cur = ggml_gelu(ctx, cur); - cb(cur, "ffn_gelu", il); - if (act_scales != NULL) { - cur = ggml_div(ctx, cur, act_scales); - cb(cur, "ffn_act", il); - } - } break; - case LLM_FFN_RELU: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - } break; - case LLM_FFN_RELU_SQR: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - - cur = ggml_sqr(ctx, cur); - cb(cur, "ffn_sqr(relu)", il); - } break; - case LLM_FFN_SWIGLU: - { - // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - int64_t split_point = cur->ne[0] / 2; - struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0)); - struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_silu(ctx, x0); - cb(cur, "ffn_silu", il); - - cur = ggml_mul(ctx, x0, x1); - cb(cur, "ffn_mul", il); - } break; - } - - if (type_gate == LLM_FFN_PAR) { - cur = ggml_mul(ctx, cur, tmp); - cb(cur, "ffn_gate_par", il); - } - - if (down) { - cur = llm_build_lora_mm(lctx, ctx, down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx, cur, down_b); - } - - if (down_s) { - cur = ggml_mul(ctx, cur, down_s); - cb(cur, "ffn_down_s", il); - } - - return cur; -} - -static struct ggml_tensor * llm_build_moe_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * gate_inp, - struct ggml_tensor * up_exps, - struct ggml_tensor * gate_exps, - struct ggml_tensor * down_exps, - struct ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llm_ffn_op_type type_op, - bool norm_w, - bool scale_w, - float w_scale, -llama_expert_gating_func_type gating_op, - const llm_build_cb & cb, - int il) { - int64_t n_embd = cur->ne[0]; - int64_t n_tokens = cur->ne[1]; - - ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = nullptr; - switch (gating_op) { - case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: - { - probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] - } break; - case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: - { - probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens] - } break; - default: - GGML_ABORT("fatal error"); - } - cb(probs, "ffn_moe_probs", il); - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx, probs, exp_probs_b); - cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx, - ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - if (norm_w) { - weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); - - ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - - weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); - } - if (scale_w) { - weights = ggml_scale(ctx, weights, w_scale); - cb(weights, "ffn_moe_weights_scaled", il); - } - - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, 
n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); - - switch (type_op) { - case LLM_FFN_SILU: - { - gate = ggml_silu(ctx, gate); - cb(gate, "ffn_moe_silu", il); - } break; - case LLM_FFN_GELU: - { - gate = ggml_gelu(ctx, gate); - cb(gate, "ffn_moe_gelu", il); - } break; - default: - GGML_ABORT("fatal error"); - } - - ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); - - ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx, experts, weights); - - // aggregate experts - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, - experts->nb[2], i*experts->nb[1]); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx, moe_out, cur_expert); - } - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx, moe_out); - } - - return moe_out; -} - -static struct ggml_tensor * llm_build_kqv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); - cb(q, "q", il); - - struct ggml_tensor * k = - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); - - struct ggml_tensor * cur; - - if (cparams.flash_attn) { - GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - - // split cached v into n_head heads (not transposed) - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv.v_l[il]->type, n_embd_head_v), - 0); - cb(v, "v", il); - - cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, - hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); - - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); - } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below - - kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx, kq, 30); - } - - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx, kq); - kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); - } - - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - - GGML_ASSERT(kv.size == n_ctx); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - } - - ggml_build_forward_expand(graph, cur); - - if (wo) { - cur = llm_build_lora_mm(lctx, ctx, wo, cur); - } - - if (wo_b) { - cb(cur, "kqv_wo", il); - } - - if (wo_b) { - cur = ggml_add(ctx, cur, wo_b); - } - - return cur; -} - -static struct ggml_tensor * llm_build_kv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t kv_head, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); - - llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); - - struct ggml_tensor * cur; - - cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); - cb(cur, "kqv_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_copy_mask_state( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct ggml_tensor * s, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t n_state, - int32_t kv_size, - int32_t kv_head, - int32_t n_kv, - int32_t n_seqs) { - struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size); - - // copy states - // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv - // this shrinks the tensors's ne[1] to n_kv - states = 
ggml_get_rows(ctx, states, state_copy); - - // clear states of sequences which are starting at the beginning of this batch - // FIXME: zero-out NANs? - states = ggml_mul(ctx, states, state_mask); - - // copy states which won't be changed further (between n_seqs and n_kv) - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), - ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); - - // the part of the states that will be used and modified - return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0); -} - -// TODO: split -static struct ggml_tensor * llm_build_mamba( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_ubatch & ubatch, - struct ggml_cgraph * graph, - struct ggml_tensor * cur, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t kv_head, - int32_t n_kv, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = model.hparams; - const llama_kv_cache & kv = lctx.kv_self; - const int64_t d_conv = hparams.ssm_d_conv; - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t d_state = hparams.ssm_d_state; - const int64_t dt_rank = hparams.ssm_dt_rank; - const int64_t n_seqs = ubatch.n_seqs; - // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) - const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; - // Use the same RMS norm as the final layer norm - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * conv_states_all = kv.k_l[il]; - struct ggml_tensor * ssm_states_all = kv.v_l[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * conv = llm_build_copy_mask_state(ctx, - graph, conv_states_all, state_copy, state_mask, - hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs); - conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs); - struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx, - graph, ssm_states_all, state_copy, state_mask, - hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs); - ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs); - - // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs); - - // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur); - // split the above in two - // => {d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); - struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - - // conv - { - // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} - struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0); - - // copy last (d_conv - 1) columns back into the state cache - struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - - ggml_build_forward_expand(graph, - ggml_cpy(ctx, last_conv, - ggml_view_1d(ctx, conv_states_all, - (d_conv - 1)*(d_inner)*(n_seqs), - 
kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - - // 1D convolution - // The equivalent is to make a self-overlapping view of conv_x - // over d_conv columns at each stride in the 3rd dimension, - // then element-wise multiply that with the conv1d weight, - // then sum the elements of each row, - // (the last two steps are a dot product over rows (also doable with mul_mat)) - // then permute away the ne[0] dimension, - // and then you're left with the resulting x tensor. - // For simultaneous sequences, all sequences need to have the same length. - x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); - - // bias - x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b); - - x = ggml_silu(ctx, x); - } - - // ssm - { - // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} - struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x); - // split - struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); - struct ggml_tensor * B = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - - // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_dt_b_c_rms) { - dt = ggml_rms_norm(ctx, dt, norm_rms_eps); - B = ggml_rms_norm(ctx, B, norm_rms_eps); - C = ggml_rms_norm(ctx, C, norm_rms_eps); - } - - // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} - dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt); - dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b); - - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. 
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} - struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); - - // store last states - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), - ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); - - struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - - // TODO: skip computing output earlier for unused tokens - - // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} - y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z))); - - // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y); - } - - // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs); - cb(cur, "mamba_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_rwkv6_time_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev, - struct ggml_tensor ** wkv_state, - size_t wkv_head_size, - size_t head_count_kv) { - size_t n_embd = cur->ne[0]; - size_t n_seq_tokens = cur->ne[1]; - size_t n_seqs = cur->ne[2]; - - size_t head_size = wkv_head_size; - size_t head_count = n_embd / head_size; - - size_t n_tokens = n_seqs * n_seq_tokens; - - bool is_qrwkv = layer->time_mix_first == nullptr; - - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - - sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - - struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); - - xxx = ggml_reshape_4d( - ctx, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_w1, xxx) - ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - ); - - xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2)); - - xxx = ggml_mul_mat( - ctx, - ggml_reshape_4d( - ctx, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - ), - xxx - ); - - struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { - // fusing these weights makes some performance improvement - sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens); - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur); - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - } else { - // for backward compatibility - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * 
sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur); - } - - struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr); - struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk); - struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx, r, layer->time_mix_receptance_b); - } - if (layer->time_mix_key_b) { - k = ggml_add(ctx, k, layer->time_mix_key_b); - } - if (layer->time_mix_value_b) { - v = ggml_add(ctx, v, layer->time_mix_value_b); - } - - struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg); - if (is_qrwkv) { - g = ggml_sigmoid(ctx, g); - } else { - g = ggml_silu(ctx, g); - } - - if (head_count_kv != head_count) { - GGML_ASSERT(head_count % head_count_kv == 0); - k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens); - v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens); - struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - k = ggml_repeat(ctx, k, tmp); - v = ggml_repeat(ctx, v, tmp); - } - - k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens); - v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens); - r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens); - - struct ggml_tensor * w = ggml_mul_mat( - ctx, - layer->time_mix_decay_w2, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw) - ) - ); - - w = ggml_add(ctx, w, layer->time_mix_decay); - w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); - w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens); - - if (is_qrwkv) { - // k = k * (1 - w) - k = ggml_sub(ctx, k, ggml_mul(ctx, k, w)); - } - - struct ggml_tensor * wkv_output; - if (!layer->time_mix_first) { - wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - } else { - wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - } - cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - if (!is_qrwkv) { - // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); - cur = ggml_norm(ctx, cur, 64e-5f); - - // Convert back to regular vectors. 
- cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); - } else { - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - } - - cur = ggml_mul(ctx, cur, g); - cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - - return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); -} - -static struct ggml_tensor * llm_build_rwkv6_channel_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev) { - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur); - struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur); - - struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr)); - struct ggml_tensor * k = ggml_sqr( - ctx, - ggml_relu( - ctx, - llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk) - ) - ); - - return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k)); -} - struct llm_build_context { - const llama_model & model; - llama_context & lctx; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - const llama_kv_cache & kv_self; + llama_context & lctx; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; + //const llama_kv_cache & kv_self; + const llama_adapter_cvec & cvec; + const llama_loras & loras; const int64_t n_embd; const int64_t n_layer; @@ -1070,12 +137,13 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; - const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_ctx_orig; + const bool worst_case; const bool flash_attn; const enum llama_pooling_type pooling_type; @@ -1089,16 +157,18 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, - bool worst_case) : - model (lctx.model), + llama_context & lctx, + const llama_ubatch & ubatch, + const llm_build_cb & cb, + bool worst_case) : lctx (lctx), + model (lctx.model), hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - kv_self (lctx.kv_self), + //kv_self (lctx.kv_self), + cvec (lctx.cvec), + loras (lctx.loras), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -1120,11 +190,12 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - n_kv (worst_case ? kv_self.size : kv_self.n), + //n_kv (worst_case ? kv_self.size : kv_self.n), + //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), - kv_head (worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_ctx_orig (cparams.n_ctx_orig_yarn), + worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -1133,156 +204,614 @@ struct llm_build_context { // all initializations should be done in init() } - void init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; + void init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx0 = ggml_init(params); + + lctx.reset(); + } + + void free() { + ggml_free(ctx0); + ctx0 = nullptr; + } + + struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { + struct ggml_tensor * inpL; + + if (ubatch.token) { + lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(lctx.inp_tokens, "inp_tokens", -1); + ggml_set_input(lctx.inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, lctx.inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, lctx.inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + lctx.inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = lctx.inp_embd; + ggml_set_input(lctx.inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + cb(inpL, "inp_embd", -1); + + return inpL; + } + + // do mat_mul, while optionally apply lora + struct ggml_tensor * build_lora_mm( + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + // do mat_mul_id, while optionally apply lora + struct ggml_tensor * build_lora_mm_id( + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? 
lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + struct ggml_tensor * build_norm( + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM_GROUP: + { + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); + cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); + } break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; + } + + struct ggml_tensor * build_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * up_s, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * gate_s, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * down_s, + struct ggml_tensor * act_scales, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (up_s) { + tmp = ggml_mul(ctx0, tmp, up_s); + cb(tmp, "ffn_up_s", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = build_lora_mm(gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = build_lora_mm(gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + + if (gate_s) { + cur = ggml_mul(ctx0, cur, gate_s); + cb(cur, "ffn_gate_s", il); + } + + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx0, cur, act_scales); + cb(cur, "ffn_act", il); + } + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx0, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + case LLM_FFN_SWIGLU: + { + // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + int64_t split_point = cur->ne[0] / 2; + struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + x0 = ggml_silu(ctx0, x0); + cb(cur, "ffn_silu", il); + + cur = ggml_mul(ctx0, x0, x1); + cb(cur, "ffn_mul", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + if (down) { + cur = build_lora_mm(down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } - ctx0 = ggml_init(params); + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; - lctx.inp_pos_bucket = nullptr; - lctx.inp_embd_enc = nullptr; - lctx.inp_KQ_mask_cross = nullptr; - } + if (down_s) { + cur = ggml_mul(ctx0, cur, down_s); + cb(cur, "ffn_down_s", il); + } - void free() { - ggml_free(ctx0); - ctx0 = nullptr; + return cur; } - struct ggml_cgraph * build_k_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + struct ggml_tensor * build_moe_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * gate_inp, + struct ggml_tensor * up_exps, + struct ggml_tensor * gate_exps, + struct ggml_tensor * down_exps, + struct ggml_tensor * exp_probs_b, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, + llama_expert_gating_func_type gating_op, + const llm_build_cb & cb, + int il) { + int64_t n_embd = cur->ne[0]; + int64_t n_tokens = cur->ne[1]; - GGML_ASSERT(kv_self.size == n_ctx); + ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); - lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(lctx.inp_K_shift, "K_shift", -1); - ggml_set_input(lctx.inp_K_shift); + ggml_tensor * probs = nullptr; + switch (gating_op) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: + { + probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] + } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: + { + probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] + } break; + default: + GGML_ABORT("fatal error"); + } + cb(probs, "ffn_moe_probs", il); - for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + // add experts selection bias - introduced in DeepSeek V3 + // leave probs unbiased as it's later used to get expert weights + ggml_tensor * selection_probs = probs; + if (exp_probs_b != nullptr) { + selection_probs = ggml_add(ctx0, probs, exp_probs_b); + cb(selection_probs, "ffn_moe_probs_biased", il); + } - struct ggml_tensor * rope_factors = build_rope_factors(il); + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + cb(selected_experts, "ffn_moe_topk", il); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, 
n_head_kv, n_ctx, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - 0); + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights", il); - struct ggml_tensor * tmp; - if (ggml_is_quantized(k->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); - cb(tmp, "K_f32", il); + if (norm_w) { + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); - for (auto & backend : lctx.backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get()); - break; - } - } - tmp = ggml_rope_ext_inplace(ctx0, tmp, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted_f32", il); + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] + cb(weights_sum, "ffn_moe_weights_sum", il); - tmp = ggml_cpy(ctx0, tmp, k); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, k, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - } - cb(tmp, "K_shifted", il); + weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_norm", il); - ggml_build_forward_expand(gf, tmp); + weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); + } + if (scale_w) { + weights = ggml_scale(ctx0, weights, w_scale); + cb(weights, "ffn_moe_weights_scaled", il); } - return gf; - } + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); - struct ggml_cgraph * build_defrag(const std::vector & ids) { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(gate, "ffn_moe_gate", il); + + switch (type_op) { + case LLM_FFN_SILU: + { + gate = ggml_silu(ctx0, gate); + cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx0, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ABORT("fatal error"); + } - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; + ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] + cb(par, "ffn_moe_gate_par", il); - if (i == id || id == ids.size()) { - continue; - } + ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + cb(experts, "ffn_moe_down", il); - uint32_t nm = 1; + experts = ggml_mul(ctx0, experts, weights); - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; + // aggregate experts + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); } + } - for (int il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = 
hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx0, moe_out); + } - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + return moe_out; + } - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + struct ggml_tensor * build_attn( + struct ggml_cgraph * graph, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + const llm_build_cb & cb, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(graph, q_cur); + ggml_build_forward_expand(graph, k_cur); + ggml_build_forward_expand(graph, v_cur); - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; + //build_kv_store(graph, k_cur, v_cur, il); + lctx.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); - if (flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + struct ggml_tensor * cur; - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); - } + //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lctx.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cb(cur, "kqv_out", il); + + return cur; + } + + //struct ggml_tensor * build_rwkv6_time_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev, + // struct ggml_tensor ** wkv_state, + // size_t wkv_head_size, + // size_t head_count_kv) { + // size_t n_embd = cur->ne[0]; + // size_t n_seq_tokens = cur->ne[1]; + // size_t n_seqs = cur->ne[2]; + + // size_t head_size = wkv_head_size; + // size_t head_count = n_embd / head_size; + + // size_t n_tokens = n_seqs * n_seq_tokens; + + // bool is_qrwkv = layer->time_mix_first == nullptr; + + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + + // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + + // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + // xxx = ggml_reshape_4d( + // ctx0, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + // ), + // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + // ); + + // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + // xxx = ggml_mul_mat( + // ctx0, + // ggml_reshape_4d( + // ctx0, + // 
layer->time_mix_w2, + // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + // ), + // xxx + // ); + + // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + // if (layer->time_mix_lerp_fused) { + // // fusing these weights makes some performance improvement + // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + // } else { + // // for backward compatibility + // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + // } + + // struct ggml_tensor * r = build_lora_mm(layer->time_mix_receptance, xr); + // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); + // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); + // if (layer->time_mix_receptance_b) { + // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + // } + // if (layer->time_mix_key_b) { + // k = ggml_add(ctx0, k, layer->time_mix_key_b); + // } + // if (layer->time_mix_value_b) { + // v = ggml_add(ctx0, v, layer->time_mix_value_b); + // } + + // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); + // if (is_qrwkv) { + // g = ggml_sigmoid(ctx0, g); + // } else { + // g = ggml_silu(ctx0, g); + // } + + // if (head_count_kv != head_count) { + // GGML_ASSERT(head_count % head_count_kv == 0); + // k = ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); + // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); + // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); + // k = ggml_repeat(ctx0, k, tmp); + // v = ggml_repeat(ctx0, v, tmp); + // } + + // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); + // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); + // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); + + // struct ggml_tensor * w = ggml_mul_mat( + // ctx0, + // layer->time_mix_decay_w2, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + // ) + // ); + + // w = ggml_add(ctx0, w, 
layer->time_mix_decay); + // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); + + // if (is_qrwkv) { + // // k = k * (1 - w) + // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + // } + + // struct ggml_tensor * wkv_output; + // if (!layer->time_mix_first) { + // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); + // } else { + // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, *wkv_state); + // } + // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + // if (!is_qrwkv) { + // // group norm with head_count groups + // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); + // cur = ggml_norm(ctx0, cur, 64e-5f); + + // // Convert back to regular vectors. + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + // } else { + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // } + + // cur = ggml_mul(ctx0, cur, g); + // cur = build_lora_mm(layer->time_mix_output, cur); + + // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); + //} + + //struct ggml_tensor * build_rwkv6_channel_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev) { + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + // struct ggml_tensor * k = ggml_sqr( + // ctx0, + // ggml_relu( + // ctx0, + // build_lora_mm(layer->channel_mix_key, xk) + // ) + // ); + + // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + //} - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } + struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - i += nm - 1; - } + lctx.build_k_shift(ctx0, gf); + + return gf; + } + + struct ggml_cgraph * build_defrag() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); + lctx.build_defrag(ctx0, gf); return gf; } @@ -1294,21 +823,6 @@ struct llm_build_context { return lctx.inp_pos; } - struct ggml_tensor * build_rope_factors(int il) { - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - - if (model.layers[il].rope_freqs != nullptr) { - return model.layers[il].rope_freqs; - } - - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } - - return model.layers[il].rope_short; - } - struct ggml_tensor * build_inp_out_ids() { lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); cb(lctx.inp_out_ids, "inp_out_ids", -1); @@ -1316,28 +830,6 @@ struct llm_build_context { return lctx.inp_out_ids; } - struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { - lctx.inp_KQ_mask = causal - ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask, "KQ_mask", -1); - ggml_set_input(lctx.inp_KQ_mask); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; - } - - struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) { - GGML_ASSERT(hparams.n_swa > 0); - - lctx.inp_KQ_mask_swa = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(lctx.inp_KQ_mask_swa); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa; - } - struct ggml_tensor * build_inp_mean() { lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); cb(lctx.inp_mean, "inp_mean", -1); @@ -1352,20 +844,6 @@ struct llm_build_context { return lctx.inp_cls; } - struct ggml_tensor * build_inp_s_copy() { - lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - cb(lctx.inp_s_copy, "inp_s_copy", -1); - ggml_set_input(lctx.inp_s_copy); - return lctx.inp_s_copy; - } - - struct ggml_tensor * build_inp_s_mask() { - lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - cb(lctx.inp_s_mask, "inp_s_mask", -1); - ggml_set_input(lctx.inp_s_mask); - return lctx.inp_s_mask; - } - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; @@ -1431,37 +909,37 @@ struct llm_build_context { return gf; } - struct ggml_tensor * build_pos_bucket(bool causal) { - if (causal) { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - } else { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - } + //struct ggml_tensor * build_pos_bucket(bool causal) { + // if (causal) { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + // } else { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + // } - ggml_set_input(lctx.inp_pos_bucket); - cb(lctx.inp_pos_bucket, "pos_bucket", -1); + // ggml_set_input(lctx.inp_pos_bucket); + // cb(lctx.inp_pos_bucket, "pos_bucket", -1); - return lctx.inp_pos_bucket; - } + // return lctx.inp_pos_bucket; + //} - struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - cb(pos_bucket_1d, "pos_bucket_1d", -1); + //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); + // cb(pos_bucket_1d, "pos_bucket_1d", -1); - struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - cb(pos_bias, "pos_bias", -1); + // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - cb(pos_bias, "pos_bias", -1); + // pos_bias = 
ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_cont(ctx0, pos_bias); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_cont(ctx0, pos_bias); + // cb(pos_bias, "pos_bias", -1); - return pos_bias; - } + // return pos_bias; + //} struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; @@ -1491,45 +969,44 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1550,9 +1027,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1574,12 +1051,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, 
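// For reference: a minimal standalone sketch (plain C++, not part of this patch) of the
// low-rank update that the build_lora_mm helper used for the Q/K/V projections above
// composes around the base matmul, i.e. y = W*x + scale * B*(A*x), with
// scale = lora_scale * alpha / rank (falling back to lora_scale alone when alpha == 0).
// The matvec/lora_matvec names and the flat row-major layout are illustrative only.
#include <cstddef>
#include <vector>

static std::vector<float> matvec(const std::vector<float> & M, size_t rows, size_t cols,
                                 const std::vector<float> & x) {
    std::vector<float> y(rows, 0.0f);
    for (size_t r = 0; r < rows; ++r) {
        for (size_t c = 0; c < cols; ++c) {
            y[r] += M[r*cols + c] * x[c];
        }
    }
    return y;
}

static std::vector<float> lora_matvec(
        const std::vector<float> & W, size_t n_out, size_t n_in,   // base weight, n_out x n_in
        const std::vector<float> & A,                              // lora_a, rank x n_in
        const std::vector<float> & B, size_t rank,                 // lora_b, n_out x rank
        float alpha, float lora_scale,
        const std::vector<float> & x) {
    const float scale = alpha != 0.0f ? lora_scale * alpha / (float) rank : lora_scale;

    std::vector<float> y   = matvec(W, n_out, n_in, x);   // base projection
    std::vector<float> ax  = matvec(A, rank,  n_in, x);   // project down to the LoRA rank
    std::vector<float> bax = matvec(B, n_out, rank, ax);  // project back up to n_out
    for (size_t i = 0; i < n_out; ++i) {
        y[i] += scale * bax[i];                           // add the scaled low-rank delta
    }
    return y;
}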
@@ -1588,12 +1065,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -1615,7 +1092,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1624,13 +1101,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1657,13 +1134,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -1676,37 +1152,37 @@ struct llm_build_context { cur = inpL; } else { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } if (n_head > 0 && n_head_kv == 0) { // "linear attention" of Llama-3_1-Nemotron-51B - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); cb(cur, "wo", il); } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1727,9 +1203,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, 
il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1754,12 +1230,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1776,7 +1252,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1785,13 +1261,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1815,31 +1291,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -1865,9 +1340,9 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1882,12 +1357,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1897,7 +1372,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = 
lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1906,13 +1381,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -1930,31 +1405,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -1970,9 +1444,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1987,12 +1461,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2002,7 +1476,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2011,11 +1485,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2034,37 +1508,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); 
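// For reference: a minimal standalone sketch (plain C++, not part of this patch) of what the
// LLM_NORM_RMS branch of build_norm computes for a single row: divide by the root mean
// square (plus eps), then multiply by the norm weight and optionally add a bias. The
// rms_norm name and std::vector layout are illustrative only.
#include <cmath>
#include <cstddef>
#include <vector>

static std::vector<float> rms_norm(const std::vector<float> & x,
                                   const std::vector<float> & w,   // mw
                                   const std::vector<float> * b,   // mb (may be null)
                                   float eps) {
    float ss = 0.0f;
    for (float v : x) {
        ss += v*v;
    }
    const float inv_rms = 1.0f / std::sqrt(ss / (float) x.size() + eps);

    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * inv_rms * w[i];
        if (b) {
            y[i] += (*b)[i];
        }
    }
    return y;
}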
// inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm_2", il); } else { cur = attn_norm; } - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2091,9 +1564,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2108,7 +1581,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result + cur = build_ffn(attn_norm, // !! use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2119,7 +1592,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2129,13 +1602,13 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2156,7 +1629,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // multiply by embedding_multiplier_scale of 78.38367176906169 inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); @@ -2164,37 +1637,36 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = 
build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -2215,9 +1687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -2231,9 +1703,9 @@ struct llm_build_context { // Grok // if attn_out_norm is present then apply it before adding the input if (model.layers[il].attn_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_out_norm", il); } @@ -2242,12 +1714,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2264,16 +1736,16 @@ struct llm_build_context { // if layer_out_norm is present then apply it before adding the input // Idea: maybe ffn_out_norm is a better name if (model.layers[il].layer_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].layer_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "layer_out_norm", il); } cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2282,13 +1754,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // Grok // multiply logits by output_multiplier_scale of 0.5773502691896257 @@ -2316,21 +1788,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); 
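// For reference: a minimal standalone sketch (plain C++, not part of this patch) of the routing
// step performed by build_moe_ffn in the unbiased softmax case: softmax over the expert logits,
// top-k selection, and optional renormalization of the selected weights so they sum to 1.
// The moe_route/route_top_k names are illustrative only.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

struct moe_route {
    std::vector<int>   experts; // selected expert indices
    std::vector<float> weights; // matching expert weights
};

static moe_route route_top_k(const std::vector<float> & logits, int n_expert_used, bool norm_w) {
    // softmax over the expert logits (LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX)
    std::vector<float> probs(logits.size());
    const float mx = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(logits[i] - mx);
        sum += probs[i];
    }
    for (float & p : probs) {
        p /= sum;
    }

    // select the n_expert_used most probable experts
    std::vector<int> idx(logits.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&probs](int a, int b) { return probs[a] > probs[b]; });

    moe_route r;
    float wsum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) {
        r.experts.push_back(idx[i]);
        r.weights.push_back(probs[idx[i]]);
        wsum += probs[idx[i]];
    }
    if (norm_w) {
        for (float & w : r.weights) {
            w /= wsum; // renormalize the selected weights
        }
    }
    return r;
}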
cb(cur, "attn_norm", il); // self-attention @@ -2339,7 +1810,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -2367,9 +1838,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2385,12 +1856,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].attn_out_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(ffn_inp, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2406,7 +1877,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2415,13 +1886,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM, cb, -1); + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -2440,13 +1911,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -2455,15 +1925,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2479,9 +1949,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2497,13 +1967,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, 
model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2513,20 +1983,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2543,28 +2013,27 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -2573,9 +2042,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2590,12 +2059,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2605,7 +2074,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2614,13 +2083,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = 
build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2645,7 +2114,7 @@ struct llm_build_context { } // construct input embeddings (token, type, position) - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); @@ -2656,11 +2125,10 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // embed layer norm - inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false); + lctx.build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -2672,33 +2140,33 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2730,7 +2198,8 @@ struct llm_build_context { struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + kq = lctx.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -2747,7 +2216,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].bo) { cb(cur, "kqv_wo", il); } @@ -2768,11 +2237,11 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); // attention layer norm - cur = 
llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); if (model.layers[il].attn_norm_2 != nullptr) { cur = ggml_add(ctx0, cur, inpL); // re-add the layer input - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); } struct ggml_tensor * ffn_inp = cur; @@ -2780,21 +2249,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2807,7 +2276,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); // output layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); // input for next layer inpL = cur; @@ -2832,27 +2301,26 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - inpL = llm_build_norm(ctx0, inpL, hparams, + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2868,9 +2336,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2886,13 +2354,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, 
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2902,20 +2370,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2934,10 +2402,9 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -2952,17 +2419,17 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { cur = attn_norm; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); if (model.layers[il].bqkv){ @@ -2985,30 +2452,30 @@ struct llm_build_context { // Q/K Layernorm if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } } @@ -3025,12 +2492,12 @@ struct llm_build_context { // feed forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3040,7 +2507,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, 
cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3049,13 +2516,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3072,22 +2539,21 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * inpSA = cur; @@ -3095,21 +2561,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -3122,17 +2588,17 @@ struct llm_build_context { cb(Kcur, "Kcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -3151,9 +2617,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3170,16 +2636,16 @@ struct llm_build_context { // feed-forward network { if (model.layers[il].ffn_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); } else { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, 
NULL, @@ -3189,7 +2655,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3198,14 +2664,14 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3222,25 +2688,24 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3270,9 +2735,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3287,12 +2752,12 @@ struct llm_build_context { // feed-forward forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3302,7 +2767,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3311,13 +2776,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3335,37 +2800,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * 
inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3384,9 +2848,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3400,12 +2864,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3414,7 +2878,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3423,13 +2887,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3446,7 +2910,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); @@ -3454,8 +2918,8 @@ struct llm_build_context { ggml_set_input(lctx.inp_pos); struct ggml_tensor * inp_pos = lctx.inp_pos; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -3463,25 +2927,25 @@ struct llm_build_context { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE 
them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3502,9 +2966,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3518,12 +2982,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3532,7 +2996,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3541,13 +3005,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3568,37 +3032,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - 
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3617,9 +3080,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3634,13 +3097,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3655,14 +3118,14 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); cb(cur_gate_inp, "ffn_shexp_gate_inp", il); // sigmoid ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_ffn = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -3680,7 +3143,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3689,13 +3152,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3715,19 +3178,18 @@ struct llm_build_context { struct ggml_tensor * ffn_output; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm_output, "attn_norm", il); // self-attention @@ -3737,7 +3199,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3747,9 +3209,9 @@ struct llm_build_context { Kcur = 
ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3776,9 +3238,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3791,7 +3253,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, + ffn_output = build_ffn(attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3802,20 +3264,20 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); @@ -3834,19 +3296,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = nullptr; - if (hparams.n_swa == 0) { - // Phi-4 doesn't use sliding window attention - KQ_mask = build_inp_KQ_mask(); - } else { - KQ_mask = build_inp_KQ_mask_swa(); - } + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -3854,12 +3310,12 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); - struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(attn_norm_output, "attn_norm", il); struct ggml_tensor * Qcur = nullptr; @@ -3867,16 +3323,16 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3901,9 +3357,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3916,14 +3372,14 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, residual); residual = cur; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3932,7 +3388,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3947,20 +3403,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (model.output_b != nullptr) { cb(cur, "result_output_no_bias", -1); @@ -3984,20 +3440,19 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, 
"attn_norm", il); struct ggml_tensor * attention_norm = cur; @@ -4005,13 +3460,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4026,9 +3481,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct ggml_tensor * sa_out = cur; @@ -4044,7 +3499,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4055,7 +3510,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4064,13 +3519,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4089,13 +3544,12 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -4104,15 +3558,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4128,9 +3582,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ 
-4146,13 +3600,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4162,20 +3616,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4194,24 +3648,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4239,9 +3692,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4257,13 +3710,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4273,20 +3726,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4304,41 +3757,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = 
build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4359,9 +3811,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4375,12 +3827,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4389,7 +3841,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4398,13 +3850,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4422,41 +3874,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = 
llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4477,9 +3928,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4493,12 +3944,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4507,7 +3958,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4516,13 +3967,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4546,7 +3997,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // scale the input embeddings inpL = ggml_scale(ctx0, inpL, scale_embd); @@ -4555,17 +4006,16 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -4575,9 +4025,9 
@@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -4616,9 +4066,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -4670,9 +4120,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -4692,12 +4142,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4711,7 +4161,7 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4720,9 +4170,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head scaling @@ -4731,7 +4181,7 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4747,7 +4197,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4755,26 +4205,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + 
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4792,9 +4241,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -4807,14 +4256,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4824,7 +4273,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4833,13 +4282,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4855,7 +4304,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4863,31 +4312,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // gemma 2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { - // (il % 2) layers use SWA - struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? 
KQ_mask_swa : KQ_mask; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4911,14 +4354,14 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -4931,14 +4374,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4947,13 +4390,13 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4962,13 +4405,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); @@ -4993,41 +4436,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE 
them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5048,9 +4490,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5065,12 +4507,12 @@ struct llm_build_context { // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5079,7 +4521,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5088,13 +4530,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5109,21 +4551,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur, - state_copy, state_mask, - kv_head, n_kv, cb, il); + //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); + cur = lctx.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -5142,13 +4583,13 @@ struct llm_build_context { } // final rmsnorm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + 
LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5167,41 +4608,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5220,16 +4660,16 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -5247,9 +4687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5264,7 +4704,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5276,7 +4716,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5285,13 +4725,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = 
ggml_scale(ctx0, cur, f_logit_scale); @@ -5315,15 +4755,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // cohere2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -5331,35 +4768,34 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { // three layers sliding window attention (window size 4096) and ROPE // fourth layer uses global attention without positional embeddings - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; + const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il); + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5385,8 +4821,8 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, + n_tokens, 1.0f / sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5401,7 +4837,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -5410,7 +4846,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); 
cb(cur, "l_out", il); // input for next layer @@ -5419,11 +4855,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); @@ -5455,41 +4891,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -5510,9 +4945,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5527,12 +4962,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5543,7 +4978,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5552,13 +4987,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, NULL, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); 
ggml_build_forward_expand(gf, cur); @@ -5579,13 +5014,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5595,21 +5029,21 @@ struct llm_build_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5629,14 +5063,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -5651,7 +5085,7 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5659,15 +5093,15 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5676,13 +5110,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5707,41 +5141,40 @@ struct llm_build_context { struct ggml_tensor * cur; 
struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5761,9 +5194,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5778,12 +5211,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -5797,7 +5230,7 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5806,13 +5239,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5828,13 +5261,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 
head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -5845,14 +5277,14 @@ struct llm_build_context { struct ggml_tensor * residual = cur; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); @@ -5866,14 +5298,14 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Kcur, "Kcur", il); Qcur = ggml_rope_ext( @@ -5891,9 +5323,9 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5908,12 +5340,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5923,7 +5355,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5932,12 +5364,12 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5955,24 +5387,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // 
self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -6000,9 +5431,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6019,13 +5450,13 @@ struct llm_build_context { struct ggml_tensor * attn_out = cur; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6037,7 +5468,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6050,13 +5481,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6065,7 +5496,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6073,13 +5504,13 @@ struct llm_build_context { } } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6100,33 +5531,32 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * 
Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -6143,9 +5573,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6160,12 +5590,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6177,12 +5607,12 @@ struct llm_build_context { cb(ffn_out, "ffn_out", il); // MoE - cur = llm_build_norm(ctx0, inpSA, hparams, + cur = build_norm(inpSA, model.layers[il].ffn_norm_exps, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6198,7 +5628,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6207,13 +5637,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6234,44 +5664,45 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -6292,9 +5723,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6309,13 +5740,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6325,7 +5756,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6340,7 +5771,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6354,7 +5785,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6363,13 +5794,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -6400,21 +5831,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, 
hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -6425,9 +5855,9 @@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -6470,9 +5900,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -6524,9 +5954,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6540,13 +5970,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6556,7 +5986,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6571,7 +6001,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6585,7 +6015,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6594,9 +6024,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, 
"result_norm", -1); // lm_head @@ -6617,26 +6047,25 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); if (model.layers[il].wq_scale) { Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); } @@ -6647,7 +6076,7 @@ struct llm_build_context { } // B1.K - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); if (model.layers[il].wk_scale) { Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); } @@ -6658,7 +6087,7 @@ struct llm_build_context { } // B1.V - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); if (model.layers[il].wv_scale) { Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); } @@ -6682,16 +6111,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, NULL, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].wo_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); } @@ -6712,12 +6141,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward forward - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -6725,12 +6154,12 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_sub_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].ffn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + cur = build_norm(cur, + model.layers[il].ffn_sub_norm, NULL, + LLM_NORM_RMS, il); cb(cur, "ffn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); + cur = build_lora_mm(model.layers[il].ffn_down, cur); if (model.layers[il].ffn_down_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); } @@ -6745,356 +6174,356 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, 
hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output - cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); + cur = build_lora_mm(model.tok_embd, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); return gf; } - struct ggml_cgraph * build_t5_enc() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + //struct ggml_cgraph * build_t5_enc() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // inpL = build_inp_embd(model.tok_embd); - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // GGML_ASSERT(lctx.is_encoding); + // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + // cb(Vcur, "Vcur", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + // cb(v, "v", il); - ggml_build_forward_expand(gf, cur); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + // ggml_build_forward_expand(gf, cur); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // cur = build_lora_mm(model.layers[il].wo_enc, cur); + // cb(cur, "kqv_out", il); + // } - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + // cb(ffn_inp, "ffn_inp", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // input for next layer - inpL = cur; - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up_enc, NULL, NULL, + // model.layers[il].ffn_gate_enc, NULL, NULL, + // model.layers[il].ffn_down_enc, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // ggml_tensor * layer_dir = cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - ggml_build_forward_expand(gf, cur); + // // input for next layer + // inpL = cur; + // } - return gf; - } + // cur = inpL; + // cb(cur, "result_embd", -1); - struct ggml_cgraph * build_t5_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + // cur = build_norm(cur, + // model.output_norm_enc, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; + // ggml_build_forward_expand(gf, cur); - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // return gf; + //} - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + //struct ggml_cgraph * build_t5_dec() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - GGML_ASSERT(!lctx.is_encoding); - GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * embd_enc = build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + // inpL = build_inp_embd(model.tok_embd); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // 
GGML_ASSERT(!lctx.is_encoding); + // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // struct ggml_tensor * embd_enc = build_inp_embd_enc(); + // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + // cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // build_kv_store(gf, Kcur, Vcur, il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = + // ggml_view_3d(ctx0, kv_self.k_l[il], + // n_embd_head_k, n_kv, n_head_kv, + // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + // 0); + // cb(k, "k", il); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * v = + // ggml_view_3d(ctx0, kv_self.v_l[il], + // n_kv, n_embd_head_v, n_head_kv, + // ggml_element_size(kv_self.v_l[il])*n_ctx, + // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + // 0); + // cb(v, "v", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - ggml_build_forward_expand(gf, cur); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + // cb(kqv, "kqv", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * inpCA = cur; + // ggml_build_forward_expand(gf, cur); - // norm - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); + // cur = build_lora_mm(model.layers[il].wo, cur); + // cb(cur, "kqv_out", il); + // } - // cross-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); + // cur = ggml_add(ctx0, cur, inpSA); + // cb(cur, "cross_inp", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * inpCA = cur; - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); + // // norm + // cur = build_norm(cur, + // model.layers[il].attn_norm_cross, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm_cross", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + // // cross-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + // cb(Vcur, "Vcur", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, 
"kq_soft_max_ext", il); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + // cb(v, "v", il); - ggml_build_forward_expand(gf, cur); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - } + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); + // ggml_build_forward_expand(gf, cur); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // cur = build_lora_mm(model.layers[il].wo_cross, cur); + // cb(cur, "kqv_out", il); + // } - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + // cb(ffn_inp, "ffn_inp", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // input for next layer - inpL = cur; - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // // input for next layer + // inpL = cur; + // } - ggml_build_forward_expand(gf, cur); + // cur = inpL; + // cb(cur, "result_embd", -1); - return gf; - } + // cur = build_norm(cur, + // model.output_norm, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // // lm_head + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} struct ggml_cgraph * build_jais() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -7106,21 +6535,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7136,9 +6564,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); + Kcur, Vcur, Qcur, 
n_tokens, 1.0f/float(n_embd_head), cb, il); } if (il == n_layer - 1) { @@ -7154,13 +6582,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7173,13 +6601,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -7198,21 +6626,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -7221,7 +6648,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7249,9 +6676,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7268,13 +6695,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7288,13 +6715,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7312,42 +6739,41 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct 
ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7368,9 +6794,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7384,13 +6810,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7409,13 +6835,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7436,44 +6862,43 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - 
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7494,9 +6919,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7511,12 +6936,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7536,13 +6961,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7550,230 +6975,232 @@ struct llm_build_context { return gf; } - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - // Token shift state dimensions should be 2 * n_emb - GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); - - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); - - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); - - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, 
token_shift, n_embd, 2, n_seqs); - - struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - att_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); - - cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - ggml_build_forward_expand(gf, cur); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); - - struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il); - x_prev = ggml_concat( - ctx0, - ffn_shift, - ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - 1 - ); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev)); - ggml_build_forward_expand(gf, cur); - - struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); - - token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); - - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - ) - ); - - if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - cur = ggml_scale(ctx0, cur, 0.5F); - } - - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - cb(cur, "result_norm", -1); - - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } + //ggml_cgraph * build_rwkv6() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + // // Token shift state dimensions should be 2 * n_emb + // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // 
struct ggml_tensor * state_mask = build_inp_s_mask(); + + // inpL = build_inp_embd(model.tok_embd); + // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; + + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // hparams.n_embd_k_s(), n_seqs); + + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); + + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); + + // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // att_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); + + // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); + // ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); + + // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + // x_prev = ggml_concat( + // ctx0, + // ffn_shift, + // ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), + // 1 + // ); + // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); + // ggml_build_forward_expand(gf, cur); + + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); + // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); + + // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); + + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + // ) + // ); + + // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + // cur = ggml_scale(ctx0, cur, 0.5F); + // } + + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); + + // // input for next layer + // inpL = cur; + // } + + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + 
// cb(cur, "result_norm", -1); + + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); - - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); - - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); - - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - token_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); - - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - ggml_build_forward_expand(gf, ffn_inp); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); - - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, 
"result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } + //ggml_cgraph * build_rwkv6qwen2() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // struct ggml_tensor * state_mask = build_inp_s_mask(); + + // inpL = build_inp_embd(model.tok_embd); + + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; + + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // hparams.n_embd_k_s(), n_seqs); + + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); + + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); + + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // token_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); + + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); + + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); + // ggml_build_forward_expand(gf, ffn_inp); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); + + // cb(ffn_inp, "ffn_inp", il); + + // // feed-forward network + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); + + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + // cb(cur, "ffn_out", il); + + // cur = ggml_add(ctx0, cur, ffn_inp); + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); + + // // input for next layer + // inpL = cur; + // } + + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://github.com/facebookresearch/chameleon // based on the original 
build_llama() function, changes: @@ -7794,13 +7221,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7809,22 +7235,22 @@ struct llm_build_context { if (hparams.swin_norm) { cur = inpL; } else { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].attn_q_norm) { @@ -7834,10 +7260,10 @@ struct llm_build_context { 0); cb(Qcur, "Qcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } @@ -7848,10 +7274,10 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -7869,14 +7295,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); } } @@ -7893,13 +7319,13 @@ struct llm_build_context { // feed-forward network if (!hparams.swin_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7908,9 +7334,9 @@ struct llm_build_context { cb(cur, "ffn_out", il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } @@ -7926,13 +7352,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = 
llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_with_img_logits", -1); // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. @@ -7959,7 +7385,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); @@ -7978,20 +7404,20 @@ struct llm_build_context { case 3: case 4: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm1, layer.norm1_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); cur = ggml_add(ctx0, cur, layer.conv1_b); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm2, layer.norm2_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); @@ -8002,10 +7428,10 @@ struct llm_build_context { } break; case 2: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.attn_norm, layer.attn_norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); struct ggml_tensor * q; struct ggml_tensor * k; @@ -8035,10 +7461,10 @@ struct llm_build_context { } break; case 5: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); } break; default: GGML_ABORT("unknown posnet layer"); }; @@ -8046,10 +7472,10 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); @@ -8066,12 +7492,12 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, layer.pw1, layer.pw1_b, NULL, NULL, NULL, NULL, layer.pw2, layer.pw2_b, NULL, @@ -8089,13 +7515,13 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); @@ -8106,7 +7532,7 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { llama_ubatch dummy = {}; dummy.equal_seqs = true; @@ -8116,7 +7542,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const llm.init(); - struct ggml_cgraph * result = llm.build_defrag(ids); + struct ggml_cgraph * result = llm.build_defrag(); llm.free(); @@ -8356,18 +7782,18 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_bitnet(); } break; - case LLM_ARCH_T5: - { - if (lctx.is_encoding) { - result = llm.build_t5_enc(); - } else { - result = llm.build_t5_dec(); - } - } break; - case LLM_ARCH_T5ENCODER: - { - result = llm.build_t5_enc(); - } break; + //case LLM_ARCH_T5: 
+ // { + // if (lctx.is_encoding) { + // result = llm.build_t5_enc(); + // } else { + // result = llm.build_t5_dec(); + // } + // } break; + //case LLM_ARCH_T5ENCODER: + // { + // result = llm.build_t5_enc(); + // } break; case LLM_ARCH_JAIS: { result = llm.build_jais(); @@ -8380,14 +7806,14 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - case LLM_ARCH_RWKV6: - { - result = llm.build_rwkv6(); - } break; - case LLM_ARCH_RWKV6QWEN2: - { - result = llm.build_rwkv6qwen2(); - } break; + //case LLM_ARCH_RWKV6: + // { + // result = llm.build_rwkv6(); + // } break; + //case LLM_ARCH_RWKV6QWEN2: + // { + // result = llm.build_rwkv6qwen2(); + // } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); @@ -8543,6 +7969,7 @@ static int llama_decode_impl( } else { ubatch = lctx.sbatch.split_simple(n_ubatch); } + const uint32_t n_tokens = ubatch.n_tokens; // count the outputs in this u_batch @@ -8567,6 +7994,8 @@ static int llama_decode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + // non-causal masks do not use the KV cache if (hparams.causal_attn) { llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() @@ -8600,6 +8029,12 @@ static int llama_decode_impl( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output is always the last tensor in the graph struct ggml_tensor * res = ggml_graph_node(gf, -1); struct ggml_tensor * embd = ggml_graph_node(gf, -2); @@ -8623,12 +8058,6 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); @@ -8850,11 +8279,17 @@ static int llama_encode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output embeddings after the final encoder normalization struct ggml_tensor * embd = nullptr; @@ -8875,10 +8310,6 @@ static int llama_encode_impl( } } - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); switch (compute_status) { case GGML_STATUS_SUCCESS: @@ -8966,227 +8397,6 @@ static int llama_encode_impl( return 0; } -// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache -static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { - auto & kv_self = lctx.kv_self; - - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - 
- // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = model.max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - kv_self.cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - kv_self.head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = kv_self.size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to 
[id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - // ggml_graph defrag - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); -#endif - - //const int64_t t_end = ggml_time_us(); - - //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); -} - // TODO: move to llama_context static void llama_kv_self_update_impl(llama_context & lctx) { bool need_reserve = false; @@ -9200,13 +8410,15 @@ static void llama_kv_self_update_impl(llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + lctx.prepare_k_shift(); + ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - lctx.set_k_shift(kv); + lctx.set_inputs({}); llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); @@ -9224,7 +8436,13 @@ static void llama_kv_self_update_impl(llama_context & lctx) { // defragment the KV cache if needed if (kv.do_defrag) { - llama_kv_cache_defrag_impl(lctx); + lctx.prepare_defrag(); + + ggml_backend_sched_reset(lctx.sched.get()); + + ggml_cgraph * gf = llama_build_graph_defrag(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; @@ -9253,16 +8471,16 @@ int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, float scale) { - ctx->lora[adapter] = scale; + ctx->loras[adapter] = scale; return 0; } int32_t llama_rm_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter) { - auto pos = ctx->lora.find(adapter); - if (pos != ctx->lora.end()) { - ctx->lora.erase(pos); + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); return 0; } @@ -9270,7 +8488,7 @@ int32_t llama_rm_adapter_lora( } void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->lora.clear(); + ctx->loras.clear(); } int32_t llama_apply_adapter_cvec( From b4ec1d44294b628a811cc97367bb7ace0a32c9fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Jan 2025 21:55:12 +0200 Subject: [PATCH 12/84] cont : move kv_self update to llama_context ggml-ci --- src/llama-context.cpp | 119 +++++++++++++++++++++++++++ src/llama-context.h | 10 +++ src/llama.cpp | 182 +++++++----------------------------------- 3 files changed, 157 insertions(+), 154 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 910e2243d7e8a..daea125fe0704 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,38 @@ static int32_t 
llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +enum ggml_status llama_context::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + + +llama_pos llama_context::pos_max() const { + return kv_self.pos_max(); +} + // TODO: improve void llama_context::reset() { inp_tokens = nullptr; @@ -540,6 +572,93 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } +bool llama_context::kv_self_update() { + bool need_reserve = false; + + auto & kv = kv_self; + + if (kv.has_shift) { + if (!kv.can_shift) { + GGML_ABORT("The current context does not support K-shift"); + } + + // apply K-shift if needed + if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + prepare_k_shift(); + + ggml_backend_sched_reset(sched.get()); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context * ctx0 = ggml_init(params); + + reset(); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + build_k_shift(ctx0, gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs({}); + + compute_graph(gf, false); + + ggml_free(ctx0); + + need_reserve = true; + } + + { + kv.has_shift = false; + + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; + } + } + } + + // defragment the KV cache if needed + if (kv.do_defrag) { + prepare_defrag(); + + ggml_backend_sched_reset(sched.get()); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context * ctx0 = ggml_init(params); + + reset(); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + build_defrag(ctx0, gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + // no input + //set_inputs({}); + + compute_graph(gf, false); + + ggml_free(ctx0); + + need_reserve = true; + + kv.do_defrag = false; + } + + return need_reserve; +} + void llama_context::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama-context.h b/src/llama-context.h index a2f41b5c8fc7d..bc33fc6ef4890 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -79,6 +79,13 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + // returns the result of ggml_backend_sched_graph_compute_async execution + enum ggml_status compute_graph( + ggml_cgraph * graph, + bool batched); + + llama_pos pos_max() const; + void reset(); void 
prepare_k_shift(); @@ -129,6 +136,9 @@ struct llama_context { struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] struct ggml_tensor * inp_K_shift; // I32 [kv_size] + // return true if need to reserve new worst-case graph + bool kv_self_update(); + void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama.cpp b/src/llama.cpp index a2e5e0bea0fb5..6e2faa71c342b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -110,7 +110,6 @@ struct llm_build_context { const llama_hparams & hparams; const llama_cparams & cparams; const llama_ubatch & ubatch; - //const llama_kv_cache & kv_self; const llama_adapter_cvec & cvec; const llama_loras & loras; @@ -137,8 +136,6 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) - //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; const int32_t n_ctx_orig; @@ -166,7 +163,6 @@ struct llm_build_context { hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - //kv_self (lctx.kv_self), cvec (lctx.cvec), loras (lctx.loras), n_embd (hparams.n_embd), @@ -190,8 +186,6 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - //n_kv (worst_case ? kv_self.size : kv_self.n), - //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), n_ctx_orig (cparams.n_ctx_orig_yarn), @@ -7532,40 +7526,6 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { - llama_ubatch dummy = {}; - dummy.equal_seqs = true; - - llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct ggml_cgraph * result = llm.build_defrag(); - - llm.free(); - - return result; -} - -static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { - llama_ubatch dummy = {}; - dummy.equal_seqs = true; - - llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct ggml_cgraph * result = llm.build_k_shift(); - - llm.free(); - - return result; -} - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_ubatch & ubatch, @@ -7836,33 +7796,6 @@ static struct ggml_cgraph * llama_build_graph( return result; } -// returns the result of ggml_backend_sched_graph_compute_async execution -static enum ggml_status llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads, - ggml_threadpool * threadpool) { - if (lctx.backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(lctx.backend_cpu, threadpool); - } - - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } - - auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: 
ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - - return status; -} - // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -7887,7 +7820,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens_all = batch.n_tokens; @@ -7989,16 +7922,11 @@ static int llama_decode_impl( lctx.n_outputs = n_outputs_new; } - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; - - GGML_ASSERT(n_threads > 0); - lctx.prepare_decode(ubatch); // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() + llama_kv_self_update(&lctx); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -8058,7 +7986,7 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); switch (compute_status) { @@ -8226,7 +8154,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens = batch.n_tokens; @@ -8274,11 +8202,6 @@ static int llama_encode_impl( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? 
lctx.threadpool : lctx.threadpool_batch; - - GGML_ASSERT(n_threads > 0); - lctx.prepare_decode(ubatch); ggml_backend_sched_reset(lctx.sched.get()); @@ -8310,7 +8233,7 @@ static int llama_encode_impl( } } - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: break; @@ -8397,76 +8320,6 @@ static int llama_encode_impl( return 0; } -// TODO: move to llama_context -static void llama_kv_self_update_impl(llama_context & lctx) { - bool need_reserve = false; - - auto & kv = lctx.kv_self; - - if (kv.has_shift) { - if (!kv.can_shift) { - GGML_ABORT("The current context does not support K-shift"); - } - - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - lctx.prepare_k_shift(); - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs({}); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); - - need_reserve = true; - } - - { - kv.has_shift = false; - - for (uint32_t i = 0; i < kv.size; ++i) { - kv.cells[i].delta = 0; - } - } - } - - // defragment the KV cache if needed - if (kv.do_defrag) { - lctx.prepare_defrag(); - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_defrag(lctx); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); - - need_reserve = true; - - kv.do_defrag = false; - } - - // reserve a worst case graph again - if (need_reserve) { - // TODO: extract to a function - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); - llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - } -} - int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, @@ -9224,9 +9077,30 @@ void llama_kv_cache_update(llama_context * ctx) { llama_kv_self_update(ctx); } -// TODO: move to llama-context void llama_kv_self_update(llama_context * ctx) { - llama_kv_self_update_impl(*ctx); + const bool need_reserve = ctx->kv_self_update(); + + // reserve a worst case graph again + if (need_reserve) { + // TODO: extract to a function + const auto & cparams = ctx->cparams; + const auto & model = ctx->model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); + + // initialize scheduler with the worst-case graph + 
ggml_backend_sched_reset(ctx->sched.get()); + if (!ggml_backend_sched_reserve(ctx->sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + } } /// From f0713498fd05afe117647c76f536866640b77b90 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 11:51:35 +0200 Subject: [PATCH 13/84] context : add get_ctx_padding() ggml-ci --- src/llama-context.cpp | 4 ++++ src/llama-context.h | 3 +++ src/llama.cpp | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index daea125fe0704..6a73659d05136 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -64,6 +64,10 @@ llama_pos llama_context::pos_max() const { return kv_self.pos_max(); } +uint32_t llama_context::get_ctx_padding(const llama_cparams & cparams) const { + return kv_self.get_padding(cparams); +} + // TODO: improve void llama_context::reset() { inp_tokens = nullptr; diff --git a/src/llama-context.h b/src/llama-context.h index bc33fc6ef4890..45eaafaad16cb 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -84,8 +84,11 @@ struct llama_context { ggml_cgraph * graph, bool batched); + // max token position across all sequences in the current context llama_pos pos_max() const; + uint32_t get_ctx_padding(const llama_cparams & cparams) const; + void reset(); void prepare_k_shift(); diff --git a/src/llama.cpp b/src/llama.cpp index 6e2faa71c342b..569c67c028305 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7820,6 +7820,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -8154,6 +8155,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -8629,7 +8631,7 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->kv_self.get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; From c75ba6851e1f6079ff7c823672908a2e5767418a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 12:41:16 +0200 Subject: [PATCH 14/84] context : move adapter code in the implementation [no ci] --- src/llama-context.cpp | 37 +++++++++++++++++++++++++++++++++++++ src/llama.cpp | 40 +++++----------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6a73659d05136..5cb31abc085ee 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1788,6 +1788,43 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id return it->second.data(); } +// llama adapter API + +int32_t llama_set_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter, + float scale) { + ctx->loras[adapter] = scale; + return 0; +} + +int32_t llama_rm_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter) { + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); + return 0; + } + + return -1; +} + +void llama_clear_adapter_lora(struct llama_context * ctx) { + ctx->loras.clear(); +} + +int32_t llama_apply_adapter_cvec( + struct llama_context * ctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); +} + + // llama state API // deprecated diff --git a/src/llama.cpp b/src/llama.cpp index 569c67c028305..b80b1c4d1688a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8322,40 +8322,6 @@ static int llama_encode_impl( return 0; } -int32_t llama_set_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter, - float scale) { - ctx->loras[adapter] = scale; - return 0; -} - -int32_t llama_rm_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; - } - - return -1; -} - -void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); -} - -int32_t llama_apply_adapter_cvec( - struct llama_context * ctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); -} - // // interface implementation // @@ -8924,7 +8890,7 @@ struct llama_context * llama_new_context_with_model( } // -// kv cache +// kv cache view // struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { @@ -8935,6 +8901,10 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * llama_kv_cache_view_update(view, ctx->kv_self); } +// +// kv cache +// + // deprecated int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { return llama_kv_self_n_tokens(ctx); From 133ad6a7232914459afc902107a53342d3abfb3b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 14:42:09 +0200 Subject: [PATCH 15/84] context : initial need_reserve logic ggml-ci --- src/llama-context.cpp | 171 ++++++++++++++++++++- src/llama-context.h | 4 +- src/llama.cpp | 337 +++++++++++++----------------------------- 3 files changed, 268 insertions(+), 244 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5cb31abc085ee..d696090cc5b3f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -576,9 
+576,7 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } -bool llama_context::kv_self_update() { - bool need_reserve = false; - +void llama_context::kv_self_update() { auto & kv = kv_self; if (kv.has_shift) { @@ -655,12 +653,14 @@ bool llama_context::kv_self_update() { ggml_free(ctx0); - need_reserve = true; - kv.do_defrag = false; + + need_reserve = true; } +} - return need_reserve; +void llama_kv_self_update(llama_context * ctx) { + ctx->kv_self_update(); } void llama_context::build_attn_inp( @@ -1824,6 +1824,165 @@ int32_t llama_apply_adapter_cvec( return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); } +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { + return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); +} + +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { + llama_kv_cache_view_update(view, ctx->kv_self); +} + +// +// kv cache +// + +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} + +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { + return llama_kv_cache_n_tokens(&ctx->kv_self); +} + +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} + +int32_t llama_kv_self_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} + +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} + +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} + +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); +} + +void llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); +} + +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +} + +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(&ctx->kv_self, 
seq_id, p0, p1, d); +} + +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} + +llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} + +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); +} + +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} // llama state API diff --git a/src/llama-context.h b/src/llama-context.h index 45eaafaad16cb..eb9a1739170dc 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -62,6 +62,7 @@ struct llama_context { int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch bool logits_all = false; + bool need_reserve = false; // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE @@ -87,6 +88,7 @@ struct llama_context { // max token position across all sequences in the current context llama_pos pos_max() const; + // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; void reset(); @@ -140,7 +142,7 @@ struct llama_context { struct ggml_tensor * inp_K_shift; // I32 [kv_size] // return true if need to reserve new worst-case graph - bool kv_self_update(); + void kv_self_update(); void build_attn_inp( ggml_context * ctx0, diff --git a/src/llama.cpp b/src/llama.cpp index b80b1c4d1688a..5807fa38802da 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -28,57 +28,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { - // loading time will be recalculated after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = 0; - time_meas tm(model.t_load_us); - - model.t_start_us = tm.t_start_us; - - try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); - - ml.print_info(); - - model.hparams.vocab_only = params.vocab_only; - - try { - model.load_arch(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model architecture: " + std::string(e.what())); - } - try { - model.load_hparams(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); - } - try { - model.load_vocab(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); - } - - model.load_stats(ml); - model.print_info(); - - if (params.vocab_only) { - LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return 0; - } - - if (!model.load_tensors(ml)) { - return -2; - } - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); - return -1; - } 
- - return 0; -} - // // llm_build // @@ -7951,6 +7900,30 @@ static int llama_decode_impl( } } + // reserve a worst case graph if needed + // TODO: extract to a function + if (lctx.need_reserve) { + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched.get()); @@ -8206,6 +8179,31 @@ static int llama_encode_impl( lctx.prepare_decode(ubatch); + // reserve a worst case graph if needed + // TODO: extract to a function + if (lctx.need_reserve) { + // TODO: extract to a function + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8419,6 +8417,57 @@ int64_t llama_time_us(void) { return ggml_time_us(); } +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { + // loading time will be recalculated after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = 0; + time_meas tm(model.t_load_us); + + model.t_start_us = tm.t_start_us; + + try { + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + + ml.print_info(); + + model.hparams.vocab_only = params.vocab_only; + + try { + model.load_arch(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model architecture: " + std::string(e.what())); + } + try { + model.load_hparams(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); + } + try { + model.load_vocab(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error 
loading model vocabulary: " + std::string(e.what())); + } + + model.load_stats(ml); + model.print_info(); + + if (params.vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return 0; + } + + if (!model.load_tensors(ml)) { + return -2; + } + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); + return -1; + } + + return 0; +} + static struct llama_model * llama_model_load_from_file_impl( const std::string & path_model, std::vector & splits, @@ -8889,192 +8938,6 @@ struct llama_context * llama_new_context_with_model( return llama_init_from_model(model, params); } -// -// kv cache view -// - -struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { - return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); -} - -void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { - llama_kv_cache_view_update(view, ctx->kv_self); -} - -// -// kv cache -// - -// deprecated -int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { - return llama_kv_self_n_tokens(ctx); -} - -int32_t llama_kv_self_n_tokens(const llama_context * ctx) { - return llama_kv_cache_n_tokens(&ctx->kv_self); -} - -// deprecated -int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { - return llama_kv_self_used_cells(ctx); -} - -int32_t llama_kv_self_used_cells(const llama_context * ctx) { - return llama_kv_cache_used_cells(&ctx->kv_self); -} - -// deprecated -void llama_kv_cache_clear(llama_context * ctx) { - llama_kv_self_clear(ctx); -} - -void llama_kv_self_clear(llama_context * ctx) { - llama_kv_cache_clear(&ctx->kv_self); -} - -// deprecated -bool llama_kv_cache_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); -} - -bool llama_kv_self_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); -} - -// deprecated -void llama_kv_cache_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_self_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} - -// deprecated -void llama_kv_cache_seq_keep( - llama_context * ctx, - llama_seq_id seq_id) { - return llama_kv_self_seq_keep(ctx, seq_id); -} - -void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); -} - -// deprecated -void llama_kv_cache_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); -} - -void llama_kv_self_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); -} - -// deprecated -void llama_kv_cache_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); -} - -void llama_kv_self_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return 
llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); -} - -// deprecated -llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_self_seq_pos_max(ctx, seq_id); -} - -llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); -} - -// deprecated -void llama_kv_cache_defrag(llama_context * ctx) { - return llama_kv_self_defrag(ctx); -} - -void llama_kv_self_defrag(llama_context * ctx) { - return llama_kv_cache_defrag(&ctx->kv_self); -} - -// deprecated -bool llama_kv_cache_can_shift(const llama_context * ctx) { - return llama_kv_self_can_shift(ctx); -} - -bool llama_kv_self_can_shift(const llama_context * ctx) { - return llama_kv_cache_can_shift(&ctx->kv_self); -} - -// deprecated -void llama_kv_cache_update(llama_context * ctx) { - llama_kv_self_update(ctx); -} - -void llama_kv_self_update(llama_context * ctx) { - const bool need_reserve = ctx->kv_self_update(); - - // reserve a worst case graph again - if (need_reserve) { - // TODO: extract to a function - const auto & cparams = ctx->cparams; - const auto & model = ctx->model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(ctx->sched.get()); - if (!ggml_backend_sched_reserve(ctx->sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - } -} - /// int32_t llama_encode( From cb8f2095c6f74d9fbb9bdfbb2ae1bf6178472150 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 19:37:52 +0200 Subject: [PATCH 16/84] wip --- src/llama.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5807fa38802da..6c8df8a112a0b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7773,6 +7773,7 @@ static int llama_decode_impl( llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens_all = batch.n_tokens; const auto & model = lctx.model; @@ -7800,9 +7801,6 @@ static int llama_decode_impl( } lctx.n_queued_tokens += n_tokens_all; - auto & kv_self = lctx.kv_self; - llama_kv_slot_restorer kv_slot_restorer(kv_self); - const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = vocab.n_tokens(); @@ -7828,16 +7826,19 @@ static int llama_decode_impl( n_outputs = 1; } - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); - // reserve output buffer if (llama_output_reserve(lctx, n_outputs) < n_outputs) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); return -2; }; + auto & kv_self = lctx.kv_self; + llama_kv_slot_restorer kv_slot_restorer(kv_self); + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ n_outputs == n_tokens_all); + while (lctx.sbatch.n_tokens > 0) { llama_ubatch ubatch; if (kv_self.recurrent) { @@ -8645,7 +8646,6 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - // this is necessary due to kv_self.n being padded later during inference cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); // with causal attention, the batch size is limited by the context size From 99422dfa3f0c686d89492958946a9b2ca91012da Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 20:30:16 +0200 Subject: [PATCH 17/84] context : introduce llama_batch_manager ggml-ci --- src/llama-context.cpp | 130 ++++++++++++++++++++++++++++++++++++++++-- src/llama-context.h | 18 +++++- src/llama.cpp | 87 ++++++---------------------- 3 files changed, 162 insertions(+), 73 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d696090cc5b3f..de54321df2f1a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,132 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +struct llama_batch_manager : public llama_batch_manager_i { + llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { + const auto & hparams = lctx.model.hparams; + const auto & n_embd = hparams.n_embd; + + const auto & kv_self = lctx.kv_self; + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ logits_all); + } + + ~llama_batch_manager() override { + } + + virtual llama_ubatch next() override { + ubatch = llama_ubatch(); + + const auto & cparams = lctx.cparams; + const auto & kv_self = lctx.kv_self; + + const auto & n_ubatch = cparams.n_ubatch; + + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + if (kv_self.recurrent) { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = lctx.sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = lctx.sbatch.split_equal(n_ubatch); + } + } else { + ubatch = lctx.sbatch.split_simple(n_ubatch); + } + + return ubatch; + } + + virtual bool prepare() override { + const auto & 
cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + + auto & kv_self = lctx.kv_self; + + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + llama_kv_self_update(&lctx); + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } + + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { + return false; + } + + kv_slot_restorer.save(slot_info); + + if (!kv_self.recurrent) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); + } + } + + return true; + } + + virtual void restore() override { + kv_slot_restorer.restore(lctx.kv_self); + } + + virtual void update() override { + auto & kv_self = lctx.kv_self; + + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + } + + virtual void finalize() override { + const auto & cparams = lctx.cparams; + + auto & kv_self = lctx.kv_self; + + // decide if we need to defrag the kv cache + if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { + const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > cparams.defrag_thold) { + //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + + kv_self.defrag(); + } + } + } + + llama_context & lctx; + + const llama_batch & batch; + + llama_ubatch ubatch; + + llama_kv_slot_restorer kv_slot_restorer; +}; + +std::unique_ptr llama_context::prepare_batch(const llama_batch & batch, bool logits_all) { + return std::make_unique(*this, batch, logits_all); +} + enum ggml_status llama_context::compute_graph( ggml_cgraph * graph, bool batched) { @@ -59,7 +185,6 @@ enum ggml_status llama_context::compute_graph( return status; } - llama_pos llama_context::pos_max() const { return kv_self.pos_max(); } @@ -94,9 +219,6 @@ void llama_context::prepare_k_shift() { void llama_context::prepare_defrag() { } -void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { -} - // llama input void llama_context::set_inputs(const llama_ubatch & ubatch) { diff --git a/src/llama-context.h b/src/llama-context.h index eb9a1739170dc..47233f4f52497 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,6 +16,20 @@ using llama_loras = std::unordered_map; +// TODO: this is very WIP - improve +struct llama_batch_manager_i { + virtual ~llama_batch_manager_i() = default; + + //bool is_done() const; + + virtual llama_ubatch next() = 0; + + virtual bool prepare() = 0; + virtual void restore() = 0; + virtual void update() = 0; + virtual void finalize() = 0; +}; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -80,6 +94,9 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + // TODO: do not pass logits_all explicitly + std::unique_ptr prepare_batch(const llama_batch & batch, bool logits_all); + 
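+    // rough usage sketch (mirrors how llama_decode_impl drives the batch manager in this
+    // patch; error handling trimmed):
+    //
+    //   auto batch_manager = lctx.prepare_batch(batch, logits_all);
+    //
+    //   while (lctx.sbatch.n_tokens > 0) {
+    //       llama_ubatch ubatch = batch_manager->next();
+    //
+    //       if (!batch_manager->prepare()) {
+    //           batch_manager->restore(); // roll back any KV cache slot changes
+    //           return -3;
+    //       }
+    //
+    //       // ... build and compute the graph for this ubatch ...
+    //
+    //       batch_manager->update();   // advance the KV cache ring buffer head
+    //   }
+    //
+    //   batch_manager->finalize();     // e.g. queue KV cache defrag if fragmented
+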
// returns the result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( ggml_cgraph * graph, @@ -95,7 +112,6 @@ struct llama_context { void prepare_k_shift(); void prepare_defrag(); - void prepare_decode(const llama_ubatch & ubatch); void set_inputs(const llama_ubatch & ubatch); diff --git a/src/llama.cpp b/src/llama.cpp index 6c8df8a112a0b..8f6de199a505c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7807,8 +7807,6 @@ static int llama_decode_impl( uint32_t n_outputs = 0; uint32_t n_outputs_prev = 0; - const auto n_ubatch = cparams.n_ubatch; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; @@ -7832,27 +7830,19 @@ static int llama_decode_impl( return -2; }; - auto & kv_self = lctx.kv_self; - llama_kv_slot_restorer kv_slot_restorer(kv_self); + const bool logits_all = n_outputs == n_tokens_all; + + //auto & kv_self = lctx.kv_self; + //llama_kv_slot_restorer kv_slot_restorer(kv_self); + + //lctx.sbatch.from_batch(batch, n_embd, + // /* simple_split */ !kv_self.recurrent, + // /* logits_all */ logits_all); - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); + auto batch_manager = lctx.prepare_batch(batch, logits_all); while (lctx.sbatch.n_tokens > 0) { - llama_ubatch ubatch; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); - } - } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); - } + llama_ubatch ubatch = batch_manager->next(); const uint32_t n_tokens = ubatch.n_tokens; @@ -7873,32 +7863,10 @@ static int llama_decode_impl( lctx.n_outputs = n_outputs_new; } - lctx.prepare_decode(ubatch); - - // non-causal masks do not use the KV cache - if (hparams.causal_attn) { - llama_kv_self_update(&lctx); - - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*n_tokens) { - kv_self.head = 0; - } - - const auto slot_info = kv_self.find_slot(ubatch); - if (!slot_info) { - return 1; - } - kv_slot_restorer.save(slot_info); - - if (!kv_self.recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); - } + if (!batch_manager->prepare()) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + batch_manager->restore(); + return -3; } // reserve a worst case graph if needed @@ -7963,7 +7931,7 @@ static int llama_decode_impl( const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { - kv_slot_restorer.restore(kv_self); + batch_manager->restore(); switch (compute_status) { case GGML_STATUS_ABORTED: return 2; @@ -7975,15 +7943,7 @@ static int llama_decode_impl( } } - // update the kv ring buffer - { - kv_self.head += n_tokens; - - // Ensure kv 
cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; - } - } + batch_manager->update(); // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { @@ -8061,6 +8021,7 @@ static int llama_decode_impl( } } } + n_outputs_prev += lctx.n_outputs; } @@ -8089,17 +8050,7 @@ static int llama_decode_impl( // wait for the computation to finish (automatically done when obtaining the model output) //llama_synchronize(&lctx); - // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); - - kv_self.defrag(); - } - } + batch_manager->finalize(); // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. @@ -8178,7 +8129,7 @@ static int llama_encode_impl( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - lctx.prepare_decode(ubatch); + //batch_manager->prepare(ubatch); // reserve a worst case graph if needed // TODO: extract to a function From a0c500b4dc91b87acba2529d2db7a2d28f1c3bb6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 21:11:03 +0200 Subject: [PATCH 18/84] context : prepare for abstraction ggml-ci --- src/llama-context.cpp | 307 ++++++++++++++++++++++++++++++++++++++++- src/llama-context.h | 11 +- src/llama.cpp | 314 ++---------------------------------------- 3 files changed, 323 insertions(+), 309 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index de54321df2f1a..4e6033ff15640 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,309 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function fn_build_graph_worst) : + model(model), + t_start_us(model.t_start_us), + t_load_us (model.t_load_us) { + + const auto & hparams = model.hparams; + + cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; + cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); + + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : + hparams.n_ctx_train; + + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + } + + cparams.yarn_attn_factor *= hparams.rope_attn_factor; + + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + } else { + cparams.pooling_type = hparams.pooling_type; + } + } + + if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { + cparams.causal_attn = hparams.causal_attn; + } else { + cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + } + + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + logits_all = params.logits_all; + + // build worst-case graph for encoder if a model contains encoder + is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() + + uint32_t kv_size = cparams.n_ctx; + ggml_type type_k = params.type_k; + ggml_type type_v = params.type_v; + + // Mamba only needs a constant number of KV cache cells per sequence + if (llama_model_is_recurrent(&model)) { + // Mamba needs at least as many KV cells as there are sequences kept at any time + kv_size = std::max((uint32_t) 1, params.n_seq_max); + // it's probably best to keep as much precision as possible for the 
states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + } + + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + + if (!hparams.vocab_only) { + // GPU backends + for (auto * dev : model.devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + + // add ACCEL backends (such as BLAS) + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + } + + // add CPU backend + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + throw std::runtime_error("failed to initialize CPU backend"); + } + backends.emplace_back(backend_cpu); + + // create a list of the set_n_threads functions in the backends + for (auto & backend : backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); + } + } + } + + llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); + + if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + throw std::runtime_error("failed to initialize self-attention cache"); + } + + { + const size_t memory_size_k = kv_self.size_k_bytes(); + const size_t memory_size_v = kv_self.size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } + + // graph outputs buffer + { + // resized during inference when a batch uses more outputs + if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); + throw std::runtime_error("failed to reserve initial output buffer"); + } + + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name (buf_output.get()), + ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + } + + // scheduler and compute buffers + { + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : backends) { + auto * buft = 
ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); + } + + const size_t max_nodes = model.max_nodes(); + + // buffer used to store the computation graph and the tensor meta data + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + params.offload_kqv; + + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } + + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + } + + // initialize scheduler with the worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp); + + // reserve pp graph first so that buffers are only allocated once + ggml_backend_sched_reserve(sched.get(), gf_pp); + int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_pp = ggml_graph_n_nodes(gf_pp); + + // reserve with tg graph to get the number of splits and nodes + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg); + ggml_backend_sched_reserve(sched.get(), gf_tg); + int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_tg = ggml_graph_n_nodes(gf_tg); + + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + gf_pp = fn_build_graph_worst(*this, ubatch_pp); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + for (size_t i = 0; i < backend_ptrs.size(); 
++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + } + } + } + +} + struct llama_batch_manager : public llama_batch_manager_i { llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & hparams = lctx.model.hparams; @@ -81,7 +384,7 @@ struct llama_batch_manager : public llama_batch_manager_i { // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_self_update(&lctx); + lctx.kv_self_update(); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -106,6 +409,8 @@ struct llama_batch_manager : public llama_batch_manager_i { } } + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + return true; } diff --git a/src/llama-context.h b/src/llama-context.h index 47233f4f52497..d0356e3ed28c3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -30,11 +30,14 @@ struct llama_batch_manager_i { virtual void finalize() = 0; }; +// TODO: make implementation details private +// TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - llama_context(const llama_model & model) - : model(model) - , t_start_us(model.t_start_us) - , t_load_us (model.t_load_us) {} + // TODO: store the worst-case graph build function and reuse it later + llama_context( + const llama_model & model, + const llama_context_params & params, + std::function fn_build_graph_worst); const struct llama_model & model; diff --git a/src/llama.cpp b/src/llama.cpp index 8f6de199a505c..408bd9030ffae 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7893,8 +7893,6 @@ static int llama_decode_impl( lctx.need_reserve = false; } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8574,309 +8572,17 @@ struct llama_context * llama_init_from_model( return nullptr; } - llama_context * ctx = new llama_context(*model); - - const auto & hparams = model->hparams; - auto & cparams = ctx->cparams; - - cparams.n_seq_max = std::max(1u, params.n_seq_max); - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; - cparams.embeddings = params.embeddings; - cparams.offload_kqv = 
params.offload_kqv; - cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); - - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; - } - - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : - hparams.n_ctx_train; - - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; - - auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { - rope_scaling_type = hparams.rope_scaling_type_train; - } - - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { - cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none - } - - if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 
1.0f : 0.0f; - } - - cparams.yarn_attn_factor *= hparams.rope_attn_factor; - - if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; - } else { - cparams.pooling_type = hparams.pooling_type; - } - } + llama_context * ctx = nullptr; - if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { - cparams.causal_attn = hparams.causal_attn; - } else { - cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; - } - - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - - if (n_ctx_per_seq < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - ctx->logits_all = params.logits_all; - - // build worst-case graph for encoder if a model contains encoder - ctx->is_encoding = llama_model_has_encoder(model); - - uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; - - // Mamba only needs a constant number of KV cache cells per sequence - if (llama_model_is_recurrent(model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states - } - - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); - GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - - if (!hparams.vocab_only) { - // GPU backends - for (auto * dev : model->devices) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(backend); - } - - // add ACCEL backends (such as BLAS) - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(backend); - } - } - - // add CPU backend - ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (ctx->backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: 
failed to initialize CPU backend\n", __func__); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(ctx->backend_cpu); - - // create a list of the set_n_threads functions in the backends - for (auto & backend : ctx->backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); - } - } - } - - llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); - - if (!ctx->kv_self.init(ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { - LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); - llama_free(ctx); - return nullptr; - } - - { - const size_t memory_size_k = ctx->kv_self.size_k_bytes(); - const size_t memory_size_v = ctx->kv_self.size_v_bytes(); - - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); - } - - // graph outputs buffer - { - // resized during inference when a batch uses more outputs - if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { - LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); - llama_free(ctx); - return nullptr; - } - - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(ctx->buf_output.get()), - ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0); - } - - // scheduler and compute buffers - { - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : ctx->backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model->devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; - } - } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } - - const size_t max_nodes = model->max_nodes(); - - // buffer used to store the computation graph and the tensor meta data - ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model->n_devices() > 1 && - model->params.n_gpu_layers > (int)model->hparams.n_layer && - model->params.split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; - - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : ctx->backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) 
{ - // ignore CPU backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; - } - } - } - - ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); - - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get())); - } - - // initialize scheduler with the worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - - // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(ctx->sched.get(), gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); - - // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true); - ggml_backend_sched_reserve(ctx->sched.get(), gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); - - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - llama_free(ctx); - return nullptr; - } - - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } - - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); - } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); - } - } + try { + // TODO: add logic which llama_context implementation to construct + ctx = new llama_context(*model, params, + [](llama_context & lctx, const llama_ubatch & ubatch) { + return llama_build_graph(lctx, ubatch, true); + }); + } catch (const std::exception & e) { + LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); + return nullptr; } return ctx; From 918885697e4409208b8157ffd18a6c347ca5b04d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 29 Jan 2025 14:45:04 +0200 Subject: [PATCH 19/84] llama : resolve rwkv 
conflict ggml-ci --- src/llama.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f410f7a2f1259..0ca8070cd56f1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7076,19 +7076,13 @@ struct llm_build_context { // 1 // ); + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); // ggml_build_forward_expand( // gf, // ggml_cpy( // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + // ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); // ggml_build_forward_expand(gf, ffn_inp); From 3e23be7911704f8474e7dcb32424bb043be63b06 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 2 Feb 2025 10:17:42 +0200 Subject: [PATCH 20/84] context : store graph build function callback ggml-ci --- src/llama-context.cpp | 37 +++++++++++++++++++++++++++++++++---- src/llama-context.h | 8 ++++++-- src/llama.cpp | 4 ++-- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 37e43213aaaec..1cd168db23fb7 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,8 +33,12 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function fn_build_graph_worst) : +llama_context::llama_context( + const llama_model & model, + const llama_context_params & params, + build_graph_callback && cb_build_graph) : model(model), + cb_build_graph(std::move(cb_build_graph)), t_start_us(model.t_start_us), t_load_us (model.t_load_us) { @@ -289,7 +293,7 @@ llama_context::llama_context(const llama_model & model, const llama_context_para llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp); + ggml_cgraph * gf_pp = this->cb_build_graph(*this, ubatch_pp, true); // reserve pp graph first so that buffers are only allocated once ggml_backend_sched_reserve(sched.get(), gf_pp); @@ -298,13 +302,13 @@ llama_context::llama_context(const llama_model & model, const llama_context_para // reserve with tg graph to get the number of splits and nodes llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg); + ggml_cgraph * gf_tg = this->cb_build_graph(*this, ubatch_tg, true); ggml_backend_sched_reserve(sched.get(), gf_tg); int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = fn_build_graph_worst(*this, ubatch_pp); + gf_pp = this->cb_build_graph(*this, 
ubatch_pp, true); if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); @@ -475,6 +479,31 @@ struct llama_batch_manager : public llama_batch_manager_i { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + // reserve a worst case graph if needed + if (lctx.need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = lctx.cb_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + return true; } diff --git a/src/llama-context.h b/src/llama-context.h index 1277645de4a35..5958deaef21a9 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -36,11 +36,13 @@ struct llama_batch_manager_i { // TODO: make implementation details private // TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - // TODO: store the worst-case graph build function and reuse it later + // TODO: tmp until llama-model starts implementing the graph build function + typedef std::function build_graph_callback; + llama_context( const llama_model & model, const llama_context_params & params, - std::function fn_build_graph_worst); + build_graph_callback && cb_build_graph); const struct llama_model & model; @@ -49,6 +51,8 @@ struct llama_context { llama_adapter_cvec cvec; llama_loras loras; + build_graph_callback cb_build_graph; + std::vector backends; std::vector> set_n_threads_fns; diff --git a/src/llama.cpp b/src/llama.cpp index 0ca8070cd56f1..6268249f21f7a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8508,8 +8508,8 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct ctx = new llama_context(*model, params, - [](llama_context & lctx, const llama_ubatch & ubatch) { - return llama_build_graph(lctx, ubatch, true); + [](llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { + return llama_build_graph(lctx, ubatch, worst_case); }); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); From 1eca8916b51a6952a304e68f312b63649a6cead9 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 3 Feb 2025 20:17:50 +0800 Subject: [PATCH 21/84] llama : fix rwkv inference (#11618) Signed-off-by: Molly Sophia --- src/llama-context.cpp | 222 +++++++++++++++++ src/llama-context.h | 27 +++ src/llama.cpp | 547 ++++++++++++++---------------------------- 3 files changed, 428 insertions(+), 368 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1cd168db23fb7..3bc0513ca1be0 100644 --- a/src/llama-context.cpp +++ 
b/src/llama-context.cpp @@ -1970,6 +1970,228 @@ ggml_tensor * llama_context::build_mamba_layer( } +ggml_tensor * llama_context::build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto token_shift_count = hparams.token_shift_count; + + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + struct ggml_tensor * token_shift_all = kv_self.k_l[il]; + + struct ggml_tensor * token_shift = build_copy_mask_state( + ctx0, graph, token_shift_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + + token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); + + return token_shift; +} + + +ggml_tensor * llama_context::build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto token_shift_count = hparams.token_shift_count; + const auto n_embd = hparams.n_embd; + + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + return ggml_cpy( + ctx0, + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), + ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + ); +} + + +ggml_tensor * llama_context::build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_tokens = ubatch.n_tokens; + const auto n_seqs = ubatch.n_seqs; + const auto n_embd = hparams.n_embd; + const auto head_size = hparams.wkv_head_size; + const auto n_head = n_embd / head_size; + const auto n_head_kv = hparams.n_head_kv(il); + + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + const auto layer = &model.layers[il]; + + bool is_qrwkv = layer->time_mix_first == nullptr; + + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + xxx = ggml_reshape_4d( + ctx0, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + ), + layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + ); + + xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + xxx = ggml_mul_mat( + ctx0, + ggml_reshape_4d( + ctx0, + layer->time_mix_w2, + layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + ), + xxx + ); + + struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + if (layer->time_mix_lerp_fused) { + // fusing these weights makes some performance improvement + sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + } else { + // for backward compatibility + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + } + + struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); + struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); + struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); + if (layer->time_mix_receptance_b) { + r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + } + if (layer->time_mix_key_b) { + k = ggml_add(ctx0, k, layer->time_mix_key_b); + } + if (layer->time_mix_value_b) { + v = ggml_add(ctx0, v, layer->time_mix_value_b); + } + + struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); + if (is_qrwkv) { + g = ggml_sigmoid(ctx0, g); + } else { + g = ggml_silu(ctx0, g); + } + + if (n_head_kv != 0 && n_head_kv != n_head) { + GGML_ASSERT(n_head % n_head_kv == 0); + k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens); + v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens); + struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); + k = ggml_repeat(ctx0, k, tmp); + v = ggml_repeat(ctx0, v, tmp); + } + + k = 
ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens); + v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens); + r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens); + + struct ggml_tensor * w = ggml_mul_mat( + ctx0, + layer->time_mix_decay_w2, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + ) + ); + + w = ggml_add(ctx0, w, layer->time_mix_decay); + w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); + + if (is_qrwkv) { + // k = k * (1 - w) + k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + } + + struct ggml_tensor * wkv_state = build_copy_mask_state( + ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + + struct ggml_tensor * wkv_output; + if (is_qrwkv) { + wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); + } else { + wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); + } + cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + ggml_build_forward_expand( + graph, + ggml_cpy( + ctx0, + wkv_state, + ggml_view_1d( + ctx0, + kv_self.v_l[il], + hparams.n_embd_v_s() * n_seqs, + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + ) + ) + ); + + if (!is_qrwkv) { + // group norm with head_count groups + cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens); + cur = ggml_norm(ctx0, cur, 64e-5f); + + // Convert back to regular vectors. + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + } else { + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + } + + cur = ggml_mul(ctx0, cur, g); + cur = build_lora_mm(ctx0, layer->time_mix_output, cur); + + return cur; +} + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 5958deaef21a9..4cf4a6312ede0 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -248,6 +248,33 @@ struct llama_context { int il, bool worst_case); + ggml_tensor * build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + + ggml_tensor * build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case); + + ggml_tensor * build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] diff --git a/src/llama.cpp b/src/llama.cpp index 64a5efd2da06d..171ea20178d0f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -574,175 +574,34 @@ struct llm_build_context { return cur; } - //struct ggml_tensor * build_rwkv6_time_mix( - // const struct llama_layer * layer, - // struct ggml_tensor * cur, - // struct ggml_tensor * x_prev, - // struct ggml_tensor ** wkv_state, - // size_t wkv_head_size, - // size_t head_count_kv) { - // size_t n_embd = cur->ne[0]; - // size_t n_seq_tokens = cur->ne[1]; - // size_t n_seqs = cur->ne[2]; - - // size_t head_size = 
wkv_head_size; - // size_t head_count = n_embd / head_size; - - // size_t n_tokens = n_seqs * n_seq_tokens; - - // bool is_qrwkv = layer->time_mix_first == nullptr; - - // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - - // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - - // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); - - // xxx = ggml_reshape_4d( - // ctx0, - // ggml_tanh( - // ctx0, - // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) - // ), - // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - // ); - - // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); - - // xxx = ggml_mul_mat( - // ctx0, - // ggml_reshape_4d( - // ctx0, - // layer->time_mix_w2, - // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - // ), - // xxx - // ); - - // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - // if (layer->time_mix_lerp_fused) { - // // fusing these weights makes some performance improvement - // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); - // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); - // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - // } else { - // // for backward compatibility - // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); - // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); - // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); - // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); - // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); - // } - - // struct ggml_tensor * r = build_lora_mm(layer->time_mix_receptance, xr); - // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); - // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); - // if (layer->time_mix_receptance_b) { - // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); - // } - // if (layer->time_mix_key_b) { - // k = ggml_add(ctx0, k, layer->time_mix_key_b); - // } - // if (layer->time_mix_value_b) { - // v = ggml_add(ctx0, v, layer->time_mix_value_b); - // } - - // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); - // if (is_qrwkv) { - // g = ggml_sigmoid(ctx0, g); - // } else { - // g = ggml_silu(ctx0, g); - // } - - // if (head_count_kv != head_count) { - // GGML_ASSERT(head_count % head_count_kv == 0); - // k = 
ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); - // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); - // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - // k = ggml_repeat(ctx0, k, tmp); - // v = ggml_repeat(ctx0, v, tmp); - // } - - // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); - // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); - // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); - - // struct ggml_tensor * w = ggml_mul_mat( - // ctx0, - // layer->time_mix_decay_w2, - // ggml_tanh( - // ctx0, - // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) - // ) - // ); - - // w = ggml_add(ctx0, w, layer->time_mix_decay); - // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); - // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); - - // if (is_qrwkv) { - // // k = k * (1 - w) - // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); - // } - - // struct ggml_tensor * wkv_output; - // if (!layer->time_mix_first) { - // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - // } else { - // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, *wkv_state); - // } - // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); - // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - // if (!is_qrwkv) { - // // group norm with head_count groups - // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); - // cur = ggml_norm(ctx0, cur, 64e-5f); - - // // Convert back to regular vectors. - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); - // } else { - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // } - - // cur = ggml_mul(ctx0, cur, g); - // cur = build_lora_mm(layer->time_mix_output, cur); - - // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); - //} + struct ggml_tensor * build_rwkv_channel_mix( + const struct llama_layer * layer, + struct ggml_tensor * cur, + struct ggml_tensor * x_prev, + const llm_arch arch) { + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + switch (arch) { + case LLM_ARCH_RWKV6: + { + struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + struct ggml_tensor * k = ggml_sqr( + ctx0, + ggml_relu( + ctx0, + build_lora_mm(layer->channel_mix_key, xk) + ) + ); + cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + } break; + default: + GGML_ABORT("fatal error"); + } - //struct ggml_tensor * build_rwkv6_channel_mix( - // const struct llama_layer * layer, - // struct ggml_tensor * cur, - // struct ggml_tensor * x_prev) { - // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); - // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); - - // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); - // struct ggml_tensor * k = ggml_sqr( - // ctx0, - // ggml_relu( - // ctx0, - // 
build_lora_mm(layer->channel_mix_key, xk) - // ) - // ); - - // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); - //} + return cur; + } struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -6935,226 +6794,178 @@ struct llm_build_context { return gf; } - //ggml_cgraph * build_rwkv6() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * build_rwkv6() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // // Token shift state dimensions should be 2 * n_emb - // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + GGML_ASSERT(hparams.token_shift_count == 2); - // const int64_t n_seqs = ubatch.n_seqs; - // const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // const int64_t n_tokens = ubatch.n_tokens; - // GGML_ASSERT(n_seqs != 0); - // GGML_ASSERT(ubatch.equal_seqs); - // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - // struct ggml_tensor * state_copy = build_inp_s_copy(); - // struct ggml_tensor * state_mask = build_inp_s_mask(); + inpL = build_inp_embd(model.tok_embd); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - // inpL = build_inp_embd(model.tok_embd); - // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); - // for (int il = 0; il < n_layer; ++il) { - // const llama_layer * layer = &model.layers[il]; - - // // (ab)using the KV cache to store the states - // struct ggml_tensor * token_shift = build_copy_mask_state( - // gf, kv_self.k_l[il], state_copy, state_mask, - // hparams.n_embd_k_s(), n_seqs); - - // struct ggml_tensor * wkv_states = build_copy_mask_state( - // gf, kv_self.v_l[il], state_copy, state_mask, - // hparams.n_embd_v_s(), n_seqs); - - // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); - - // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - - // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); - // struct ggml_tensor * x_prev = ggml_concat( - // ctx0, - // att_shift, - // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - // 1 - // ); - - // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - // ggml_build_forward_expand(gf, cur); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); - - // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); - // x_prev = ggml_concat( - // ctx0, - // ffn_shift, - // ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, 
x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - // 1 - // ); - // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); - // ggml_build_forward_expand(gf, cur); - - // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); - - // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); - - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - // ) - // ); - - // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - // cur = ggml_scale(ctx0, cur, 0.5F); - // } + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; - // cur = lctx.cvec.apply_to(ctx0, cur, il); - // cb(cur, "l_out", il); + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; - // // input for next layer - // inpL = cur; - // } + struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); - // cur = inpL; - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); - // cb(cur, "result_norm", -1); + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + cb(att_norm, "attn_norm", il); - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + att_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); - // ggml_build_forward_expand(gf, cur); + cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - // return gf; - //} + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + cb(ffn_norm, "ffn_norm", il); + + x_prev = ggml_concat( + ctx0, + ffn_shift, + ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), + 1 + ); + + cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); + cur = ggml_add(ctx0, cur, ffn_inp); + + token_shift = ggml_concat(ctx0, + ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), + ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), + 1 + ); + 
ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + + if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + cur = ggml_scale(ctx0, cur, 0.5F); + } + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - //ggml_cgraph * build_rwkv6qwen2() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * build_rwkv6qwen2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - // const int64_t n_seqs = ubatch.n_seqs; - // const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // const int64_t n_tokens = ubatch.n_tokens; - // GGML_ASSERT(n_seqs != 0); - // GGML_ASSERT(ubatch.equal_seqs); - // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - // struct ggml_tensor * state_copy = build_inp_s_copy(); - // struct ggml_tensor * state_mask = build_inp_s_mask(); + inpL = build_inp_embd(model.tok_embd); - // inpL = build_inp_embd(model.tok_embd); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); - // for (int il = 0; il < n_layer; ++il) { - // const llama_layer * layer = &model.layers[il]; - - // // (ab)using the KV cache to store the states - // struct ggml_tensor * token_shift = build_copy_mask_state( - // gf, kv_self.k_l[il], state_copy, state_mask, - // hparams.n_embd_k_s(), n_seqs); - - // struct ggml_tensor * wkv_states = build_copy_mask_state( - // gf, kv_self.v_l[il], state_copy, state_mask, - // hparams.n_embd_v_s(), n_seqs); - - // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); - - // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); - // struct ggml_tensor * x_prev = ggml_concat( - // ctx0, - // token_shift, - // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - // 1 - // ); - - // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), - // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - // ggml_build_forward_expand(gf, ffn_inp); - 
// ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; - // cb(ffn_inp, "ffn_inp", il); + inpL = build_inp_embd(model.tok_embd); - // // feed-forward network - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); - - // cur = build_ffn(cur, - // model.layers[il].ffn_up, NULL, NULL, - // model.layers[il].ffn_gate, NULL, NULL, - // model.layers[il].ffn_down, NULL, NULL, - // NULL, - // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - // cb(cur, "ffn_out", il); + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; - // cur = ggml_add(ctx0, cur, ffn_inp); - // cur = lctx.cvec.apply_to(ctx0, cur, il); - // cb(cur, "l_out", il); + struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); - // // input for next layer - // inpL = cur; - // } + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + cb(att_norm, "attn_norm", il); - // cur = inpL; - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + token_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); - // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); - // cb(cur, "result_norm", -1); + cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); + token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); + ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - // ggml_build_forward_expand(gf, cur); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // return gf; - //} + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } // ref: https://github.com/facebookresearch/chameleon // based on the original build_llama() function, changes: @@ -7726,14 +7537,14 @@ static 
struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - //case LLM_ARCH_RWKV6: - // { - // result = llm.build_rwkv6(); - // } break; - //case LLM_ARCH_RWKV6QWEN2: - // { - // result = llm.build_rwkv6qwen2(); - // } break; + case LLM_ARCH_RWKV6: + { + result = llm.build_rwkv6(); + } break; + case LLM_ARCH_RWKV6QWEN2: + { + result = llm.build_rwkv6qwen2(); + } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); From e0d913fccbffe7913b2fa6a00590ca68800c9b59 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 10:02:50 +0200 Subject: [PATCH 22/84] llama : clear whitespaces --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 171ea20178d0f..f03386af42b9d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6850,7 +6850,7 @@ struct llm_build_context { cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); cur = ggml_add(ctx0, cur, ffn_inp); - token_shift = ggml_concat(ctx0, + token_shift = ggml_concat(ctx0, ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 From b15fede7a9a044d0a15da03b9ceb08f7007bfc95 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 14:34:45 +0200 Subject: [PATCH 23/84] kv-cache : fix defrag condition ggml-ci --- src/llama-context.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3bc0513ca1be0..719622eaa74ec 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -532,11 +532,13 @@ struct llama_batch_manager : public llama_batch_manager_i { // decide if we need to defrag the kv cache if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? 
1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); kv_self.defrag(); } From f9971ef2e1754f8dde65d5fc0602b7719a0c5326 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 14:59:51 +0200 Subject: [PATCH 24/84] llama : dedup reserve code --- src/llama.cpp | 50 ++------------------------------------------------ 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 465938cf02ba1..e89e70bbec560 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7629,30 +7629,6 @@ static int llama_decode_impl( return -3; } - // reserve a worst case graph if needed - // TODO: extract to a function - if (lctx.need_reserve) { - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - lctx.need_reserve = false; - } - ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -7889,30 +7865,8 @@ static int llama_encode_impl( //batch_manager->prepare(ubatch); - // reserve a worst case graph if needed - // TODO: extract to a function - if (lctx.need_reserve) { - // TODO: extract to a function - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - lctx.need_reserve = false; - } + // TODO: do reserve + GGML_ASSERT(lctx.need_reserve == false); ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); From 879ba82777b93f30c32eca731d0bf03e7fd20be7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 15:00:02 +0200 Subject: [PATCH 25/84] server : increase context size for the tests ggml-ci --- examples/server/tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/server/tests/utils.py b/examples/server/tests/utils.py index ce06806620c0b..97d650a9c0cd0 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -280,7 +280,7 @@ def tinyllama2() -> ServerProcess: server.model_hf_repo = "ggml-org/models" server.model_hf_file = "tinyllamas/stories260K.gguf" server.model_alias = "tinyllama-2" - server.n_ctx = 256 + server.n_ctx = 512 server.n_batch = 32 server.n_slots = 2 server.n_predict = 64 From ef358ee78f08e4d7af3916e0d101925c5bc6e122 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 16:11:17 +0200 Subject: [PATCH 26/84] context : add decode/encode ggml-ci --- src/llama-context.cpp | 630 ++++++++++++++++++++++++++++++++++-------- src/llama-context.h | 32 +-- src/llama.cpp | 386 +------------------------- 3 files changed, 526 insertions(+), 522 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7705d583bb004..5d21dd5ef2cb3 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,6 +9,121 @@ #include #include +// llama output (TMP) + +// Make sure enough space is available for outputs. +// Returns max number of outputs for which space was reserved. +static size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + const auto & vocab = lctx.model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; + const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (lctx.output_ids.empty()) { + // init, never resized afterwards + lctx.output_ids.resize(n_batch); + } + + const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!lctx.buf_output || prev_size < new_size) { + if (lctx.buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + lctx.buf_output = nullptr; + lctx.logits = nullptr; + lctx.embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = lctx.model.dev_output(); + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (lctx.buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); + + lctx.logits = has_logits ? 
output_base : nullptr; + lctx.embd = has_embd ? output_base + logits_size : nullptr; + + lctx.output_size = n_outputs_max; + lctx.logits_size = logits_size; + lctx.embd_size = embd_size; + + // set all ids as invalid (negative) + std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); + + ggml_backend_buffer_clear(lctx.buf_output.get(), 0); + + lctx.n_outputs = 0; + + return n_outputs_max; +} + +// make the outputs have the same order they had in the user-provided batch +static void llama_output_reorder(struct llama_context & ctx) { + std::vector & out_ids = ctx.sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = ctx.model.vocab.n_tokens(); + const uint32_t n_embd = ctx.model.hparams.n_embd; + + const int32_t n_outputs = ctx.n_outputs; + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (ctx.logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); + } + } + if (ctx.embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); + } + } + } + std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + ctx.output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -340,6 +455,20 @@ llama_context::llama_context( } +struct llama_batch_manager_i { + virtual ~llama_batch_manager_i() = default; + + virtual bool is_done() const = 0; + virtual llama_ubatch next() = 0; + virtual bool prepare() = 0; + virtual void restore() = 0; + virtual void update() = 0; + virtual void finalize() = 0; + + // TODO: might be temporary + int64_t n_outputs_all = 0; +}; + struct llama_batch_manager : public llama_batch_manager_i { llama_batch_manager(llama_context & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & model = lctx.model; @@ -398,6 +527,10 @@ struct llama_batch_manager : public llama_batch_manager_i { ~llama_batch_manager() override { } + virtual bool is_done() const override { + return lctx.sbatch.n_tokens == 0; + } + virtual llama_ubatch next() override { ubatch = llama_ubatch(); @@ -558,6 +691,390 @@ std::unique_ptr llama_context::prepare_batch(const llama_ return std::make_unique(*this, batch); } +int llama_context::decode(llama_batch & inp_batch) { + is_encoding = false; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd; + + // TODO: try catch + auto bman = prepare_batch(batch); + + const auto n_outputs_all = bman->n_outputs_all; + + // reserve output buffer + // TODO: move to batch manager? + if (llama_output_reserve(*this, bman->n_outputs_all) < (size_t) n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; + + while (!bman->is_done()) { + llama_ubatch ubatch = bman->next(); + + if (!bman->prepare()) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + bman->restore(); + return -3; + } + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output is always the last tensor in the graph + struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); + struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); + + if (n_outputs == 0) { + // no output + t_logits = nullptr; + t_embd = nullptr; + } else if (cparams.embeddings) { + t_logits = nullptr; // do not extract logits for embedding case + t_embd = nullptr; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + t_embd = ggml_graph_node(gf, i); + break; + } + } + GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); + } else { + t_embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); + } + + const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + bman->restore(); + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + bman->update(); + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + if (t_logits) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits + n_outputs_prev*n_vocab; + const int32_t n_outputs_new = n_outputs; + + if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd + n_outputs_prev*n_embd; + const int32_t n_outputs_new = n_outputs; + 
+ if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings (cleared before processing each batch) + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + n_outputs_prev += n_outputs; + } + + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { + size_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; + + // wait for the computation to finish (automatically done when obtaining the model output) + //llama_synchronize(&; + + bman->finalize(); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + +int llama_context::encode(llama_batch & inp_batch) { + is_encoding = true; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (llama_output_reserve(*this, n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (uint32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + inp_embd_enc = NULL; + n_outputs = n_tokens; + + //batch_manager->prepare(ubatch); + + // TODO: do reserve + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output embeddings after the final encoder normalization + struct ggml_tensor * t_embd = nullptr; + + // there are two cases here + if (llama_model_has_decoder(&model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + t_embd = ggml_graph_node(gf, -1); + GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + t_embd = ggml_graph_node(gf, -1); + if (strcmp(t_embd->name, "result_embd_pooled") != 0) { + t_embd = ggml_graph_node(gf, -2); + } + GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } + + const auto compute_status = compute_graph(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + if (llama_model_has_decoder(&model)) { + embd_enc.resize(n_tokens*n_embd); + float * embd_out = embd_enc.data(); + + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + // remember the sequence ids used during the encoding - needed for cross attention later + seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(embd != nullptr); + + switch 
(cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + enum ggml_status llama_context::compute_graph( ggml_cgraph * graph, bool batched) { @@ -2194,119 +2711,6 @@ ggml_tensor * llama_context::build_rwkv6_time_mix( return cur; } -// llama output - -size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & vocab = lctx.model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; - const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (lctx.output_ids.empty()) { - // init, never resized afterwards - lctx.output_ids.resize(n_batch); - } - - const size_t prev_size = lctx.buf_output ? 
ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!lctx.buf_output || prev_size < new_size) { - if (lctx.buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - lctx.buf_output = nullptr; - lctx.logits = nullptr; - lctx.embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = lctx.model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (lctx.buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); - - lctx.logits = has_logits ? output_base : nullptr; - lctx.embd = has_embd ? output_base + logits_size : nullptr; - - lctx.output_size = n_outputs_max; - lctx.logits_size = logits_size; - lctx.embd_size = embd_size; - - // set all ids as invalid (negative) - std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - - ggml_backend_buffer_clear(lctx.buf_output.get(), 0); - - lctx.n_outputs = 0; - - return n_outputs_max; -} - -void llama_output_reorder(struct llama_context & ctx) { - std::vector & out_ids = ctx.sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_embd = ctx.model.hparams.n_embd; - - const int32_t n_outputs = ctx.n_outputs; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? 
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (ctx.logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); - } - } - if (ctx.embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); - } - } - } - std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - ctx.output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - // // interface implementation // diff --git a/src/llama-context.h b/src/llama-context.h index 4cf4a6312ede0..f6d63eb3cebfc 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,22 +16,7 @@ using llama_loras = std::unordered_map; -// TODO: this is very WIP - improve -struct llama_batch_manager_i { - virtual ~llama_batch_manager_i() = default; - - //bool is_done() const; - - virtual llama_ubatch next() = 0; - - virtual bool prepare() = 0; - virtual void restore() = 0; - virtual void update() = 0; - virtual void finalize() = 0; - - // TODO: might be temporary - int64_t n_outputs_all = 0; -}; +struct llama_batch_manager_i; // TODO: make implementation details private // TODO: become abstract base class, split the current implementation into different child classes @@ -44,6 +29,8 @@ struct llama_context { const llama_context_params & params, build_graph_callback && cb_build_graph); + virtual ~llama_context() = default; + const struct llama_model & model; llama_cparams cparams; @@ -104,8 +91,10 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; - // TODO: do not pass logits_all explicitly - std::unique_ptr prepare_batch(const llama_batch & batch); + virtual std::unique_ptr prepare_batch(const llama_batch & batch); + + virtual int decode(llama_batch & inp_batch); + virtual int encode(llama_batch & inp_batch); // returns the result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( @@ -286,13 +275,6 @@ struct llama_context { int n_pos_per_token = 1; }; -// Make sure enough space is available for outputs. -// Returns max number of outputs for which space was reserved. -size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs); - -// make the outputs have the same order they had in the user-provided batch -void llama_output_reorder(struct llama_context & ctx); - // For internal test use // TODO: remove const std::vector> & llama_internal_get_tensor_map(struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index e89e70bbec560..ed5e1e5254e7a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7584,213 +7584,7 @@ static struct ggml_cgraph * llama_build_graph( static int llama_decode_impl( llama_context & lctx, llama_batch inp_batch) { - - lctx.is_encoding = false; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - - const auto & model = lctx.model; - const auto & vocab = model.vocab; - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - - const int32_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; - - // TODO: try catch - auto bman = lctx.prepare_batch(batch); - - const auto n_outputs_all = bman->n_outputs_all; - - // reserve output buffer - // TODO: move to batch manager? - if (llama_output_reserve(lctx, bman->n_outputs_all) < (size_t) n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; - - int64_t n_outputs_prev = 0; - - while (lctx.sbatch.n_tokens > 0) { - llama_ubatch ubatch = bman->next(); - - if (!bman->prepare()) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - bman->restore(); - return -3; - } - - ggml_backend_sched_reset(lctx.sched.get()); - ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs(ubatch); - - // the output is always the last tensor in the graph - struct ggml_tensor * res = ggml_graph_node(gf, -1); - struct ggml_tensor * embd = ggml_graph_node(gf, -2); - - if (lctx.n_outputs == 0) { - // no output - res = nullptr; - embd = nullptr; - } else if (cparams.embeddings) { - res = nullptr; // do not extract logits for embedding case - embd = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - embd = ggml_graph_node(gf, i); - break; - } - } - GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); - } else { - embd = nullptr; // do not extract embeddings when not needed - GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); - } - - const auto compute_status = lctx.compute_graph(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - bman->restore(); - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - } - - bman->update(); - - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} - - // extract logits - if (res) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(lctx.logits != nullptr); - - float * logits_out = lctx.logits + n_outputs_prev*n_vocab; - const int32_t n_outputs_new = lctx.n_outputs; - - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size); - ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); - } - } - - // extract embeddings - if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); - GGML_ASSERT(backend_embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - 
GGML_ASSERT(lctx.embd != nullptr); - float * embd_out = lctx.embd + n_outputs_prev*n_embd; - const int32_t n_outputs_new = lctx.n_outputs; - - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings (cleared before processing each batch) - auto & embd_seq_out = lctx.embd_seq; - - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // extract the rerank score - a single float per sequence - auto & embd_seq_out = lctx.embd_seq; - - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(1); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - - n_outputs_prev += lctx.n_outputs; - } - - // set output mappings - { - bool sorted_output = true; - - GGML_ASSERT(lctx.sbatch.out_ids.size() == (size_t) n_outputs_all); - - for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { - size_t out_id = lctx.sbatch.out_ids[i]; - lctx.output_ids[out_id] = i; - if (out_id != i) { - sorted_output = false; - } - } - - if (sorted_output) { - lctx.sbatch.out_ids.clear(); - } - } - - // set to total number of outputs in the batch, for use in llama_get_logits_ith - lctx.n_outputs = n_outputs_all; - - // wait for the computation to finish (automatically done when obtaining the model output) - //llama_synchronize(&lctx); - - bman->finalize(); - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(lctx.sched.get()); - - return 0; + return lctx.decode(inp_batch); } // encode a batch of tokens by evaluating the encoder part of the transformer @@ -7805,183 +7599,7 @@ static int llama_decode_impl( static int llama_encode_impl( llama_context & lctx, llama_batch inp_batch) { - - lctx.is_encoding = true; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens = batch.n_tokens; - - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - - if (batch.token) { - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - } - } - - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); - - if (lctx.t_compute_start_us == 0) { - lctx.t_compute_start_us = ggml_time_us(); - } - - lctx.n_queued_tokens += n_tokens; - - const int64_t n_embd = hparams.n_embd; - - lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); - - // reserve output buffer - if (llama_output_reserve(lctx, n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); - return -2; - }; - - for (uint32_t i = 0; i < n_tokens; ++i) { - lctx.output_ids[i] = i; - } - - lctx.inp_embd_enc = NULL; - lctx.n_outputs = n_tokens; - - //batch_manager->prepare(ubatch); - - // TODO: do reserve - GGML_ASSERT(lctx.need_reserve == false); - - ggml_backend_sched_reset(lctx.sched.get()); - ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs(ubatch); - - // the output embeddings after the final encoder normalization - struct ggml_tensor * embd = nullptr; - - // there are two cases here - if (llama_model_has_decoder(&lctx.model)) { - // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = ggml_graph_node(gf, -1); - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); - } else { - // second case is an encoder-only T5 model - if (cparams.embeddings) { - // only output embeddings if required - embd = ggml_graph_node(gf, -1); - if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = ggml_graph_node(gf, -2); - } - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); - } - } - - const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - - // extract embeddings - if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); - GGML_ASSERT(backend_embd != nullptr); - - if (llama_model_has_decoder(&lctx.model)) { - lctx.embd_enc.resize(n_tokens*n_embd); - float * embd_out = lctx.embd_enc.data(); - - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - // remember the sequence ids used during the encoding - needed for cross attention later - lctx.seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < 
ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - lctx.seq_ids_enc[i].insert(seq_id); - } - } - } else { - GGML_ASSERT(lctx.embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(lctx.embd != nullptr); - float * embd_out = lctx.embd; - - GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings - auto & embd_seq_out = lctx.embd_seq; - embd_seq_out.clear(); - - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - for (uint32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - } - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(lctx.sched.get()); - - return 0; + return lctx.encode(inp_batch); } // From d1d8d530083a9bf3ada2427bf59e97fa58667365 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 16:50:14 +0200 Subject: [PATCH 27/84] bman : remove ubatch member ggml-ci --- src/llama-context.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5d21dd5ef2cb3..4387128fedf15 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -460,9 +460,9 @@ struct llama_batch_manager_i { virtual bool is_done() const = 0; virtual llama_ubatch next() = 0; - virtual bool prepare() = 0; + virtual bool prepare(const llama_ubatch & ubatch) = 0; virtual void restore() = 0; - virtual void update() = 0; + virtual void update(const llama_ubatch & ubatch) = 0; virtual void finalize() = 0; // TODO: might be temporary @@ -532,7 +532,7 @@ struct llama_batch_manager : public llama_batch_manager_i { } virtual llama_ubatch next() override { - ubatch = llama_ubatch(); + llama_ubatch ubatch = llama_ubatch(); const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; @@ -557,7 +557,7 @@ struct llama_batch_manager : public llama_batch_manager_i { return ubatch; } - virtual bool prepare() override { + virtual bool prepare(const llama_ubatch & ubatch) override { const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; const auto & batch = lctx.sbatch.batch; @@ -644,7 +644,7 @@ struct llama_batch_manager : public llama_batch_manager_i { kv_slot_restorer.restore(lctx.kv_self); } - virtual void update() override { + virtual void update(const llama_ubatch & ubatch) override { auto & kv_self = lctx.kv_self; // update the kv ring buffer @@ -682,8 +682,6 @@ struct llama_batch_manager : public llama_batch_manager_i { const llama_batch 
& batch; - llama_ubatch ubatch; - llama_kv_slot_restorer kv_slot_restorer; }; @@ -728,7 +726,7 @@ int llama_context::decode(llama_batch & inp_batch) { while (!bman->is_done()) { llama_ubatch ubatch = bman->next(); - if (!bman->prepare()) { + if (!bman->prepare(ubatch)) { LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); bman->restore(); return -3; @@ -782,7 +780,7 @@ int llama_context::decode(llama_batch & inp_batch) { } } - bman->update(); + bman->update(ubatch); // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { From 2cd8a903c84b9fbf91f256a6349e05e492a47421 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 17:01:27 +0200 Subject: [PATCH 28/84] context : make output functions members ggml-ci --- src/llama-context.cpp | 238 ++++++++++++++++++++---------------------- src/llama-context.h | 8 ++ 2 files changed, 122 insertions(+), 124 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4387128fedf15..87d6642da778f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,121 +9,6 @@ #include #include -// llama output (TMP) - -// Make sure enough space is available for outputs. -// Returns max number of outputs for which space was reserved. -static size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & vocab = lctx.model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; - const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (lctx.output_ids.empty()) { - // init, never resized afterwards - lctx.output_ids.resize(n_batch); - } - - const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!lctx.buf_output || prev_size < new_size) { - if (lctx.buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - lctx.buf_output = nullptr; - lctx.logits = nullptr; - lctx.embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = lctx.model.dev_output(); - auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (lctx.buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); - - lctx.logits = has_logits ? output_base : nullptr; - lctx.embd = has_embd ? output_base + logits_size : nullptr; - - lctx.output_size = n_outputs_max; - lctx.logits_size = logits_size; - lctx.embd_size = embd_size; - - // set all ids as invalid (negative) - std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - - ggml_backend_buffer_clear(lctx.buf_output.get(), 0); - - lctx.n_outputs = 0; - - return n_outputs_max; -} - -// make the outputs have the same order they had in the user-provided batch -static void llama_output_reorder(struct llama_context & ctx) { - std::vector & out_ids = ctx.sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_embd = ctx.model.hparams.n_embd; - - const int32_t n_outputs = ctx.n_outputs; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? - // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (ctx.logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); - } - } - if (ctx.embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); - } - } - } - std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - ctx.output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -334,7 +219,7 @@ llama_context::llama_context( // graph outputs buffer { // resized during inference when a batch uses more outputs - if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) { + if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); throw std::runtime_error("failed to reserve initial output buffer"); } @@ -716,7 +601,7 @@ int llama_context::decode(llama_batch & inp_batch) { // reserve output buffer // TODO: move to batch manager? 
- if (llama_output_reserve(*this, bman->n_outputs_all) < (size_t) n_outputs_all) { + if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; }; @@ -940,7 +825,7 @@ int llama_context::encode(llama_batch & inp_batch) { const llama_ubatch ubatch = sbatch.split_simple(n_tokens); // reserve output buffer - if (llama_output_reserve(*this, n_tokens) < n_tokens) { + if (reserve_outputs(n_tokens) < n_tokens) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); return -2; }; @@ -1555,6 +1440,113 @@ void llama_context::set_inputs(const llama_ubatch & ubatch) { } } +void llama_context::reorder_outputs() { + std::vector & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} + +size_t llama_context::reserve_outputs(size_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + + logits = has_logits ? output_base : nullptr; + embd = has_embd ? output_base + logits_size : nullptr; + + output_size = n_outputs_max; + + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); + + ggml_backend_buffer_clear(buf_output.get(), 0); + + n_outputs = 0; + + return n_outputs_max; +} + // do mat_mul, while optionally apply lora ggml_tensor * llama_context::build_lora_mm( ggml_context * ctx0, @@ -2827,8 +2819,7 @@ float * llama_get_logits(struct llama_context * ctx) { llama_synchronize(ctx); // reorder logits for backward compatibility - // TODO: maybe deprecate this - llama_output_reorder(*ctx); + ctx->reorder_outputs(); return ctx->logits; } @@ -2877,8 +2868,7 @@ float * llama_get_embeddings(struct llama_context * ctx) { llama_synchronize(ctx); // reorder embeddings for backward compatibility - // TODO: maybe deprecate this - llama_output_reorder(*ctx); + ctx->reorder_outputs(); return ctx->embd; } @@ -3187,7 +3177,7 @@ struct llama_data_write { //} void write_output_ids(struct llama_context * ctx) { - llama_output_reorder(*ctx); + ctx->reorder_outputs(); const uint32_t n_outputs = ctx->n_outputs; @@ -3281,7 +3271,7 @@ struct llama_data_read { uint32_t n_outputs; read_to(&n_outputs, sizeof(n_outputs)); - if (n_outputs > llama_output_reserve(*ctx, n_outputs)) { + if (n_outputs > ctx->reserve_outputs(n_outputs)) { throw std::runtime_error("could not reserve outputs"); } diff --git a/src/llama-context.h b/src/llama-context.h index f6d63eb3cebfc..8f22fd3b1d3a1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -114,6 +114,14 @@ struct llama_context { void set_inputs(const llama_ubatch & ubatch); + // make the outputs have the same order they had in the user-provided batch + // TODO: maybe deprecate this + void reorder_outputs(); + + // Make sure enough space is available for outputs. + // Returns max number of outputs for which space was reserved. 
+ size_t reserve_outputs(size_t n_outputs); + ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, From 02ef4be975bd7549971caa3149061008790112bb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 11 Feb 2025 11:25:18 +0200 Subject: [PATCH 29/84] context : initial abstraction ggml-ci --- src/llama-context.cpp | 2077 +++++++++++++++++++++++------------------ src/llama-context.h | 480 ++++++++-- src/llama.cpp | 240 +---- 3 files changed, 1570 insertions(+), 1227 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 87d6642da778f..13beb097cbadd 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,14 +33,68 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -llama_context::llama_context( +// llama_context + +llama_context::llama_context(const llama_model & model) : + model (model), + t_start_us(model.t_start_us), + t_load_us (model.t_load_us) { +} + +llama_context::~llama_context() = default; + +void llama_context::synchronize() { + ggml_backend_sched_synchronize(sched.get()); + + // FIXME: if multiple single tokens are evaluated without a synchronization, + // the stats will be added to the prompt evaluation stats + // this should only happen when using batch size 1 to evaluate a batch + + // add the evaluation to the stats + if (n_queued_tokens == 1) { + if (!cparams.no_perf) { + t_eval_us += ggml_time_us() - t_compute_start_us; + } + n_eval++; + } else if (n_queued_tokens > 1) { + if (!cparams.no_perf) { + t_p_eval_us += ggml_time_us() - t_compute_start_us; + } + n_p_eval += n_queued_tokens; + } + + // get a more accurate load time, upon first eval + if (n_queued_tokens > 0 && !has_evaluated_once) { + t_load_us = ggml_time_us() - t_start_us; + has_evaluated_once = true; + } + + n_queued_tokens = 0; + t_compute_start_us = 0; +} + +int64_t llama_context::n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 
4 : 1; +} + +ggml_context_ptr llama_context::init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + return ggml_context_ptr { ggml_init(params) }; +} + +// llama_context_unified + +llama_context_unified::llama_context_unified( const llama_model & model, const llama_context_params & params, build_graph_callback && cb_build_graph) : - model(model), - cb_build_graph(std::move(cb_build_graph)), - t_start_us(model.t_start_us), - t_load_us (model.t_load_us) { + llama_context(model), + cb_build_graph(std::move(cb_build_graph)){ const auto & hparams = model.hparams; @@ -252,6 +306,7 @@ llama_context::llama_context( const size_t max_nodes = model.max_nodes(); // buffer used to store the computation graph and the tensor meta data + // TODO: move to base class buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); // TODO: move these checks to ggml_backend_sched @@ -337,25 +392,161 @@ llama_context::llama_context( } } } +} + +llama_context_unified::~llama_context_unified() = default; +uint32_t llama_context_unified::n_ctx() const { + return cparams.n_ctx; } -struct llama_batch_manager_i { - virtual ~llama_batch_manager_i() = default; +uint32_t llama_context_unified::n_batch() const { + return cparams.n_batch; +} - virtual bool is_done() const = 0; - virtual llama_ubatch next() = 0; - virtual bool prepare(const llama_ubatch & ubatch) = 0; - virtual void restore() = 0; - virtual void update(const llama_ubatch & ubatch) = 0; - virtual void finalize() = 0; +uint32_t llama_context_unified::n_ubatch() const { + return cparams.n_ubatch; +} - // TODO: might be temporary - int64_t n_outputs_all = 0; -}; +uint32_t llama_context_unified::n_seq_max() const { + // TODO: add notion of n_seq_max to llama_kv_cache and use it here + return kv_self.size; +} + +llama_kv_cache * llama_context_unified::get_kv_self() { + return &kv_self; +} + +const llama_kv_cache * llama_context_unified::get_kv_self() const { + return &kv_self; +} + +enum llama_pooling_type llama_context_unified::pooling_type() const { + return cparams.pooling_type; +} + +float * llama_context_unified::get_logits() { + // reorder logits for backward compatibility + reorder_outputs(); + + return logits; +} + +float * llama_context_unified::get_logits_ith(int32_t i) { + int32_t j = -1; + + try { + if (logits == nullptr) { + throw std::runtime_error("no logits"); + } + + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } + + return logits + j*model.vocab.n_tokens(); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + +float * llama_context_unified::get_embeddings() { + // reorder embeddings for backward compatibility + reorder_outputs(); + + return embd; +} + +float * llama_context_unified::get_embeddings_ith(int32_t i) { + int32_t j = -1; + + try { + if 
(embd == nullptr) { + throw std::runtime_error("no embeddings"); + } + + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } + + return embd + j*model.hparams.n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + +float * llama_context_unified::get_embeddings_seq(llama_seq_id seq_id) { + auto it = embd_seq.find(seq_id); + if (it == embd_seq.end()) { + return nullptr; + } + + return it->second.data(); +} + +ggml_context_ptr llama_context_unified::init() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; + + return llama_context::init(); +} -struct llama_batch_manager : public llama_batch_manager_i { - llama_batch_manager(llama_context & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { +struct llama_context_unified::batch_manager { + batch_manager(llama_context_unified & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & model = lctx.model; const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; @@ -409,14 +600,14 @@ struct llama_batch_manager : public llama_batch_manager_i { /* logits_all */ logits_all); } - ~llama_batch_manager() override { + ~batch_manager() { } - virtual bool is_done() const override { + bool is_done() const { return lctx.sbatch.n_tokens == 0; } - virtual llama_ubatch next() override { + llama_ubatch next() { llama_ubatch ubatch = llama_ubatch(); const auto & cparams = lctx.cparams; @@ -442,7 +633,7 @@ struct llama_batch_manager : public llama_batch_manager_i { return ubatch; } - virtual bool prepare(const llama_ubatch & ubatch) override { + bool prepare(const llama_ubatch & ubatch) { const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; const auto & batch = lctx.sbatch.batch; @@ -525,11 +716,11 @@ struct llama_batch_manager : public llama_batch_manager_i { return true; } - virtual void restore() override { + void restore() { kv_slot_restorer.restore(lctx.kv_self); } - virtual void update(const llama_ubatch & ubatch) override { + void update(const llama_ubatch & ubatch) { auto & kv_self = lctx.kv_self; // update the kv ring buffer @@ -543,7 +734,7 @@ struct llama_batch_manager : public llama_batch_manager_i { } } - virtual void finalize() override { + void finalize() { const auto & cparams = lctx.cparams; auto & kv_self = lctx.kv_self; @@ -563,18 +754,20 @@ struct llama_batch_manager : public llama_batch_manager_i { } } - llama_context & lctx; + int64_t n_outputs_all = 0; + + llama_context_unified & lctx; const 
llama_batch & batch; llama_kv_slot_restorer kv_slot_restorer; }; -std::unique_ptr llama_context::prepare_batch(const llama_batch & batch) { - return std::make_unique(*this, batch); +std::unique_ptr llama_context_unified::prepare_batch(const llama_batch & batch) { + return std::make_unique(*this, batch); } -int llama_context::decode(llama_batch & inp_batch) { +int llama_context_unified::decode(llama_batch & inp_batch) { is_encoding = false; if (inp_batch.n_tokens == 0) { @@ -679,12 +872,11 @@ int llama_context::decode(llama_batch & inp_batch) { GGML_ASSERT(logits != nullptr); float * logits_out = logits + n_outputs_prev*n_vocab; - const int32_t n_outputs_new = n_outputs; - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) logits_size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); } } @@ -699,12 +891,11 @@ int llama_context::decode(llama_batch & inp_batch) { // extract token embeddings GGML_ASSERT(embd != nullptr); float * embd_out = embd + n_outputs_prev*n_embd; - const int32_t n_outputs_new = n_outputs; - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_MEAN: @@ -770,7 +961,7 @@ int llama_context::decode(llama_batch & inp_batch) { n_outputs = n_outputs_all; // wait for the computation to finish (automatically done when obtaining the model output) - //llama_synchronize(&; + //synchronize(); bman->finalize(); @@ -781,7 +972,7 @@ int llama_context::decode(llama_batch & inp_batch) { return 0; } -int llama_context::encode(llama_batch & inp_batch) { +int llama_context_unified::encode(llama_batch & inp_batch) { is_encoding = true; if (inp_batch.n_tokens == 0) { @@ -958,7 +1149,7 @@ int llama_context::encode(llama_batch & inp_batch) { return 0; } -enum ggml_status llama_context::compute_graph( +enum ggml_status llama_context_unified::compute_graph( ggml_cgraph * graph, bool batched) { int n_threads = batched ? 
cparams.n_threads_batch : cparams.n_threads; @@ -985,43 +1176,23 @@ enum ggml_status llama_context::compute_graph( return status; } -llama_pos llama_context::pos_max() const { +llama_pos llama_context_unified::pos_max() const { return kv_self.pos_max(); } -uint32_t llama_context::get_ctx_padding(const llama_cparams & cparams) const { +uint32_t llama_context_unified::get_ctx_padding(const llama_cparams & cparams) const { return kv_self.get_padding(cparams); } -// TODO: improve -void llama_context::reset() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; - inp_KQ_mask = nullptr; - inp_KQ_mask_cnv = nullptr; - inp_KQ_mask_swa = nullptr; - inp_KQ_mask_swa_cnv = nullptr; - inp_KQ_mask_cross = nullptr; - inp_K_shift = nullptr; - inp_s_copy = nullptr; - inp_s_mask = nullptr; -} - -void llama_context::prepare_k_shift() { +void llama_context_unified::prepare_k_shift() { } -void llama_context::prepare_defrag() { +void llama_context_unified::prepare_defrag() { } // llama input -void llama_context::set_inputs(const llama_ubatch & ubatch) { +void llama_context_unified::set_inputs(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; // @@ -1056,8 +1227,8 @@ void llama_context::set_inputs(const llama_ubatch & ubatch) { if (ubatch.pos && inp_pos) { const int64_t n_tokens = ubatch.n_tokens; - auto n_pos = n_pos_per_token; - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(inp_pos)); + + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { @@ -1440,7 +1611,7 @@ void llama_context::set_inputs(const llama_ubatch & ubatch) { } } -void llama_context::reorder_outputs() { +void llama_context_unified::reorder_outputs() { std::vector & out_ids = sbatch.out_ids; if (!out_ids.empty()) { const uint32_t n_vocab = model.vocab.n_tokens(); @@ -1478,7 +1649,7 @@ void llama_context::reorder_outputs() { } } -size_t llama_context::reserve_outputs(size_t n_outputs) { +size_t llama_context_unified::reserve_outputs(size_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; @@ -1605,7 +1776,7 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } -void llama_context::kv_self_update() { +void llama_context_unified::kv_self_update() { auto & kv = kv_self; if (kv.has_shift) { @@ -1619,15 +1790,8 @@ void llama_context::kv_self_update() { ggml_backend_sched_reset(sched.get()); - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context * ctx0 = ggml_init(params); - - reset(); + auto ctx = init(); + auto ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -1639,8 +1803,6 @@ void llama_context::kv_self_update() { compute_graph(gf, false); - ggml_free(ctx0); - need_reserve = true; } @@ -1659,15 +1821,8 @@ void llama_context::kv_self_update() { ggml_backend_sched_reset(sched.get()); - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context * ctx0 = ggml_init(params); - - reset(); + auto ctx = init(); + auto ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -1680,19 
+1835,13 @@ void llama_context::kv_self_update() { compute_graph(gf, false); - ggml_free(ctx0); - kv.do_defrag = false; need_reserve = true; } } -void llama_kv_self_update(llama_context * ctx) { - ctx->kv_self_update(); -} - -void llama_context::build_attn_inp( +void llama_context_unified::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -1723,7 +1872,7 @@ void llama_context::build_attn_inp( } } -void llama_context::build_attn_kv_store( +void llama_context_unified::build_attn_kv_store( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * k_cur, @@ -1767,7 +1916,7 @@ void llama_context::build_attn_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); } -ggml_tensor * llama_context::build_attn_qkv( +ggml_tensor * llama_context_unified::build_attn_qkv( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * wo, @@ -1919,7 +2068,7 @@ ggml_tensor * llama_context::build_attn_qkv( return cur; } -ggml_tensor * llama_context::build_soft_max_ext( +ggml_tensor * llama_context_unified::build_soft_max_ext( ggml_context * ctx0, ggml_tensor * kq, float kq_scale) { @@ -1928,7 +2077,7 @@ ggml_tensor * llama_context::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -ggml_tensor * llama_context::get_rope_factors(int il) { +ggml_tensor * llama_context_unified::get_rope_factors(int il) { const auto & hparams = model.hparams; // choose long/short freq factors based on the context size @@ -1945,7 +2094,96 @@ ggml_tensor * llama_context::get_rope_factors(int il) { return model.layers[il].rope_short; } -void llama_context::build_k_shift( +ggml_tensor * llama_context_unified::build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) { + const auto & hparams = model.hparams; + + const int64_t n_embd = hparams.n_embd; + + struct ggml_tensor * inpL; + + if (ubatch.token) { + inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp_tokens, "inp_tokens", -1); + ggml_set_input(inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = inp_embd; + ggml_set_input(inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + //cb(inpL, "inp_embd", -1); + + return inpL; +} + +ggml_tensor * llama_context_unified::build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) { + inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp_pos); + + return inp_pos; +} + +ggml_tensor * llama_context_unified::build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + const int32_t n_out_ids = worst_case ? 
n_tokens : n_outputs; + + inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); + ggml_set_input(inp_out_ids); + + return inp_out_ids; +} + +ggml_tensor * llama_context_unified::build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) { + inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp_mean); + + return inp_mean; +} + +ggml_tensor * llama_context_unified::build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) { + inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp_cls); + + return inp_cls; +} + +void llama_context_unified::build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) { const auto & n_ctx = cparams.n_ctx; @@ -2017,7 +2255,7 @@ void llama_context::build_k_shift( } } -void llama_context::build_defrag( +void llama_context_unified::build_defrag( ggml_context * ctx0, ggml_cgraph * graph) { const auto & hparams = model.hparams; @@ -2287,7 +2525,39 @@ void llama_context::build_defrag( #endif } -ggml_tensor * llama_context::build_inp_s_copy( +ggml_tensor * llama_context_unified::build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + const auto & hparams = model.hparams; + const int64_t n_embd = hparams.n_embd; + + // TODO: not sure if this is correct + const int32_t n_outputs_enc = worst_case ? n_tokens : embd_enc.size() / n_embd; + + inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); + ggml_set_input(inp_embd_enc); + + return inp_embd_enc; +} + +ggml_tensor * llama_context_unified::build_inp_KQ_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + const auto & hparams = model.hparams; + const int64_t n_embd = hparams.n_embd; + + // TODO: not sure if this is correct + const int32_t n_outputs_enc = worst_case ? n_tokens : embd_enc.size() / n_embd; + + inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + ggml_set_input(inp_KQ_mask_cross); + + return inp_KQ_mask_cross; +} + +ggml_tensor * llama_context_unified::build_inp_s_copy( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? kv_self.size : kv_self.n; @@ -2298,7 +2568,7 @@ ggml_tensor * llama_context::build_inp_s_copy( return inp_s_copy; } -ggml_tensor * llama_context::build_inp_s_mask( +ggml_tensor * llama_context_unified::build_inp_s_mask( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? 
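// for the worst-case (reservation) graph use the full cache size; during
// normal decoding only the kv_self.n cells currently in use need to be masked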
kv_self.size : kv_self.n; @@ -2308,7 +2578,7 @@ ggml_tensor * llama_context::build_inp_s_mask( return inp_s_mask; } -ggml_tensor * llama_context::build_copy_mask_state( +ggml_tensor * llama_context_unified::build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * s, @@ -2343,7 +2613,7 @@ ggml_tensor * llama_context::build_copy_mask_state( } // TODO: split -ggml_tensor * llama_context::build_mamba_layer( +ggml_tensor * llama_context_unified::build_mamba_layer( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -2479,7 +2749,7 @@ ggml_tensor * llama_context::build_mamba_layer( } -ggml_tensor * llama_context::build_rwkv_token_shift_load( +ggml_tensor * llama_context_unified::build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * state_copy, @@ -2506,7 +2776,7 @@ ggml_tensor * llama_context::build_rwkv_token_shift_load( } -ggml_tensor * llama_context::build_rwkv_token_shift_store( +ggml_tensor * llama_context_unified::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, @@ -2530,7 +2800,7 @@ ggml_tensor * llama_context::build_rwkv_token_shift_store( } -ggml_tensor * llama_context::build_rwkv6_time_mix( +ggml_tensor * llama_context_unified::build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -2702,1048 +2972,999 @@ ggml_tensor * llama_context::build_rwkv6_time_mix( } // -// interface implementation +// state // -void llama_free(struct llama_context * ctx) { - delete ctx; -} +// TODO: this needs a big rework -uint32_t llama_n_ctx(const struct llama_context * ctx) { - return ctx->cparams.n_ctx; -} +// TODO: replace all non-fatal assertions with returned errors or exceptions +struct llama_data_write { + llama_data_write(llama_context_unified * ctx) : ctx(ctx) {} + virtual ~llama_data_write() = default; -uint32_t llama_n_batch(const struct llama_context * ctx) { - return ctx->cparams.n_batch; -} + virtual void write(const void * src, size_t size) = 0; + virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; + virtual size_t get_size_written() = 0; -uint32_t llama_n_ubatch(const struct llama_context * ctx) { - return ctx->cparams.n_ubatch; -} + void write_string(const std::string & str) { + uint32_t str_size = str.size(); -uint32_t llama_n_seq_max(const struct llama_context * ctx) { - // TODO: add notion of n_seq_max to llama_kv_cache and use it here - return ctx->kv_self.size; -} + write(&str_size, sizeof(str_size)); + write(str.data(), str_size); + } -const llama_model * llama_get_model(const llama_context * ctx) { - return &ctx->model; -} + void write_model_info() { + const std::string arch_str = llm_arch_name(ctx->model.arch); + write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical + } -llama_kv_cache * llama_get_kv_self(llama_context * ctx) { - return &ctx->kv_self; -} + //void write_rng(const std::mt19937 & rng) { + // std::ostringstream rng_ss; + // rng_ss << rng; -enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { - return ctx->cparams.pooling_type; -} + // const std::string & rng_str = rng_ss.str(); -void llama_attach_threadpool( - struct llama_context * ctx, - ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch) { - ctx->threadpool = threadpool; - ctx->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; -} + // write_string(rng_str); + //} -void llama_detach_threadpool(struct llama_context * ctx) { - ctx->threadpool = nullptr; - ctx->threadpool_batch = nullptr; -} + void write_output_ids() { + ctx->reorder_outputs(); -void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { - ctx->cparams.n_threads = n_threads; - ctx->cparams.n_threads_batch = n_threads_batch; -} + const uint32_t n_outputs = ctx->n_outputs; -int32_t llama_n_threads(struct llama_context * ctx) { - return ctx->cparams.n_threads; -} + std::vector output_pos; -int32_t llama_n_threads_batch(struct llama_context * ctx) { - return ctx->cparams.n_threads_batch; -} + const size_t n_batch = ctx->cparams.n_batch; + const auto & output_ids = ctx->output_ids; -void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = abort_callback_data; + GGML_ASSERT(n_outputs <= ctx->output_size); - for (auto & backend : ctx->backends) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); - auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); - if (set_abort_callback_fn) { - set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); + output_pos.resize(n_outputs); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch; ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + GGML_ASSERT((uint32_t) pos < n_outputs); + output_pos[pos] = i; + } } - } -} -void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { - ctx->cparams.embeddings = embeddings; -} + write(&n_outputs, sizeof(n_outputs)); -void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { - ctx->cparams.causal_attn = causal_attn; -} + if (n_outputs) { + write(output_pos.data(), n_outputs * sizeof(int32_t)); + } + } -void llama_synchronize(struct llama_context * ctx) { - ggml_backend_sched_synchronize(ctx->sched.get()); + void write_logits() { + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); - // FIXME: if multiple single tokens are evaluated without a synchronization, - // the stats will be added to the prompt evaluation stats - // this should only happen when using batch size 1 to evaluate a batch + write(&logits_size, sizeof(logits_size)); - // add the evaluation to the stats - if (ctx->n_queued_tokens == 1) { - if (!ctx->cparams.no_perf) { - ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; - } - ctx->n_eval++; - } else if (ctx->n_queued_tokens > 1) { - if (!ctx->cparams.no_perf) { - ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (logits_size) { + write(ctx->logits, logits_size * sizeof(float)); } - ctx->n_p_eval += ctx->n_queued_tokens; } - // get a more accurate load time, upon first eval - if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; - } + void write_embeddings() { + const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); - ctx->n_queued_tokens = 0; - ctx->t_compute_start_us = 0; -} + write(&embeddings_size, sizeof(embeddings_size)); -float * 
llama_get_logits(struct llama_context * ctx) { - llama_synchronize(ctx); + if (embeddings_size) { + write(ctx->embd, embeddings_size * sizeof(float)); + } + } - // reorder logits for backward compatibility - ctx->reorder_outputs(); + llama_context_unified * ctx; +}; - return ctx->logits; -} +struct llama_data_read { + llama_data_read(llama_context_unified * ctx) : ctx(ctx) {} + virtual ~llama_data_read() = default; -float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { - int32_t j = -1; + virtual const uint8_t * read(size_t size) = 0; + virtual void read_to(void * dst, size_t size) = 0; + virtual size_t get_size_read() = 0; - llama_synchronize(ctx); + void read_string(std::string & str) { + uint32_t str_size; + read_to(&str_size, sizeof(str_size)); - try { - if (ctx->logits == nullptr) { - throw std::runtime_error("no logits"); - } + str.assign((const char *) read(str_size), str_size); + } - if (i < 0) { - j = ctx->n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); - } - } else if ((size_t) i >= ctx->output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size())); - } else { - j = ctx->output_ids[i]; - } + // validate model information + void read_model_info() { + const std::string cur_arch_str = llm_arch_name(ctx->model.arch); - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= ctx->n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); + std::string arch_str; + read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); } - - return ctx->logits + j*ctx->model.vocab.n_tokens(); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif + // TODO: add more info which needs to be identical but which is not verified otherwise } -} -float * llama_get_embeddings(struct llama_context * ctx) { - llama_synchronize(ctx); + //void read_rng(std::mt19937 & rng) { + // std::string rng_str; + // read_string(rng_str); - // reorder embeddings for backward compatibility - ctx->reorder_outputs(); + // std::istringstream rng_ss(rng_str); + // rng_ss >> rng; - return ctx->embd; -} + // if (rng_ss.fail()) { + // throw std::runtime_error("failed to load RNG state"); + // } + //} -float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { - int32_t j = -1; + void read_output_ids() { + std::vector output_pos; - llama_synchronize(ctx); + uint32_t n_outputs; + read_to(&n_outputs, sizeof(n_outputs)); - try { - if (ctx->embd == nullptr) { - throw std::runtime_error("no embeddings"); + if (n_outputs > ctx->reserve_outputs(n_outputs)) { + throw std::runtime_error("could not reserve outputs"); } - if (i < 0) { - j = ctx->n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); + if (n_outputs) { + output_pos.resize(n_outputs); + read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= ctx->cparams.n_batch) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, 
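// an output id that does not fit in the current batch indicates a corrupt
// or incompatible session state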
ctx->cparams.n_batch)); + } + ctx->output_ids[id] = i; } - } else if ((size_t) i >= ctx->output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size())); - } else { - j = ctx->output_ids[i]; - } - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= ctx->n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); + ctx->n_outputs = n_outputs; } - - return ctx->embd + j*ctx->model.hparams.n_embd; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif } -} -float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { - llama_synchronize(ctx); + void read_logits() { + uint64_t logits_size; + read_to(&logits_size, sizeof(logits_size)); - auto it = ctx->embd_seq.find(seq_id); - if (it == ctx->embd_seq.end()) { - return nullptr; - } + if (ctx->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } - return it->second.data(); -} + if (logits_size) { + read_to(ctx->logits, logits_size * sizeof(float)); + } + } -// llama adapter API + void read_embeddings() { + uint64_t embeddings_size; + read_to(&embeddings_size, sizeof(embeddings_size)); -int32_t llama_set_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter, - float scale) { - ctx->loras[adapter] = scale; - return 0; -} + if (ctx->embd_size < embeddings_size) { + throw std::runtime_error("embeddings buffer too small"); + } -int32_t llama_rm_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; + if (embeddings_size) { + read_to(ctx->embd, embeddings_size * sizeof(float)); + } } - return -1; -} + llama_context_unified * ctx; +}; -void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); -} +struct llama_data_write_dummy : llama_data_write { + llama_data_write_dummy(llama_context_unified * ctx) : llama_data_write(ctx) {} -int32_t llama_apply_adapter_cvec( - struct llama_context * ctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); -} + void write(const void * /* src */, size_t size) override { + size_written += size; + } -// -// kv cache view -// + void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { + size_written += size; + } -struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { - return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); -} + size_t get_size_written() override { + return size_written; + } -void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { - llama_kv_cache_view_update(view, ctx->kv_self); -} + size_t size_written = 0; +}; -// -// kv cache -// +struct llama_data_write_buffer : llama_data_write { + llama_data_write_buffer( + llama_context_unified * ctx, + uint8_t * p, size_t len) : llama_data_write(ctx), ptr(p), buf_size(len) {} -// deprecated -int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { - return llama_kv_self_n_tokens(ctx); -} + void write(const void * src, size_t size) override { + if (size 
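// buf_size is decremented after every write, so it always holds the
// remaining capacity of the caller-provided buffer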
> buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + memcpy(ptr, src, size); + ptr += size; + size_written += size; + buf_size -= size; + } -int32_t llama_kv_self_n_tokens(const llama_context * ctx) { - return llama_kv_cache_n_tokens(&ctx->kv_self); -} + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ggml_backend_tensor_get(tensor, ptr, offset, size); + ptr += size; + size_written += size; + buf_size -= size; + } -// deprecated -int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { - return llama_kv_self_used_cells(ctx); -} + size_t get_size_written() override { + return size_written; + } -int32_t llama_kv_self_used_cells(const llama_context * ctx) { - return llama_kv_cache_used_cells(&ctx->kv_self); -} + uint8_t * ptr; + size_t buf_size = 0; + size_t size_written = 0; +}; -// deprecated -void llama_kv_cache_clear(llama_context * ctx) { - llama_kv_self_clear(ctx); -} +struct llama_data_read_buffer : llama_data_read { + llama_data_read_buffer( + llama_context_unified * ctx, + const uint8_t * p, size_t len) : llama_data_read(ctx), ptr(p), buf_size(len) {} -void llama_kv_self_clear(llama_context * ctx) { - llama_kv_cache_clear(&ctx->kv_self); -} + const uint8_t * read(size_t size) override { + const uint8_t * base_ptr = ptr; + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ptr += size; + size_read += size; + buf_size -= size; + return base_ptr; + } -// deprecated -bool llama_kv_cache_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); -} + void read_to(void * dst, size_t size) override { + memcpy(dst, read(size), size); + } -bool llama_kv_self_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); -} + size_t get_size_read() override { + return size_read; + } -// deprecated -void llama_kv_cache_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); -} + const uint8_t * ptr; + size_t buf_size = 0; + size_t size_read = 0; +}; -void llama_kv_self_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} +struct llama_data_write_file : llama_data_write { + llama_data_write_file( + llama_context_unified * ctx, + llama_file * f) : llama_data_write(ctx), file(f) {} -// deprecated -void llama_kv_cache_seq_keep( - llama_context * ctx, - llama_seq_id seq_id) { - return llama_kv_self_seq_keep(ctx, seq_id); -} + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } -void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); -} + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + temp_buffer.resize(size); + ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); + write(temp_buffer.data(), temp_buffer.size()); + } -// deprecated -void llama_kv_cache_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - 
llama_pos delta) { - return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); -} + size_t get_size_written() override { + return size_written; + } -void llama_kv_self_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); -} + llama_file * file; + size_t size_written = 0; + std::vector temp_buffer; +}; -// deprecated -void llama_kv_cache_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +struct llama_data_read_file : llama_data_read { + llama_data_read_file( + llama_context_unified * ctx, + llama_file * f) : llama_data_read(ctx), file(f) {} + + void read_to(void * dst, size_t size) override { + file->read_raw(dst, size); + size_read += size; + } + + const uint8_t * read(size_t size) override { + temp_buffer.resize(size); + read_to(temp_buffer.data(), size); + return temp_buffer.data(); + } + + size_t get_size_read() override { + return size_read; + } + + llama_file * file; + size_t size_read = 0; + std::vector temp_buffer; +}; + +size_t llama_context_unified::state_get_size() { + llama_data_write_dummy data_ctx(this); + try { + return state_get_data(data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } } -void llama_kv_self_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); +size_t llama_context_unified::state_get_data(uint8_t * dst, size_t size) { + llama_data_write_buffer data_ctx(this, dst, size); + try { + return state_get_data(data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } } -// deprecated -llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_self_seq_pos_max(ctx, seq_id); +size_t llama_context_unified::state_set_data(const uint8_t * src, size_t size) { + llama_data_read_buffer data_ctx(this, src, size); + try { + return state_set_data(data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } } -llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +size_t llama_context_unified::state_seq_get_size(llama_seq_id seq_id) { + llama_data_write_dummy data_ctx(this); + try { + return state_seq_get_data(data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } } -// deprecated -void llama_kv_cache_defrag(llama_context * ctx) { - return llama_kv_self_defrag(ctx); +size_t llama_context_unified::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { + llama_data_write_buffer data_ctx(this, dst, size); + try { + return state_seq_get_data(data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } } -void llama_kv_self_defrag(llama_context * ctx) { - return llama_kv_cache_defrag(&ctx->kv_self); +size_t llama_context_unified::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { + llama_data_read_buffer data_ctx(this, src, size); + try { + return 
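// forward to the stream-based overload; errors are reported by returning 0
// instead of propagating the exception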
state_seq_set_data(data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } } -// deprecated -bool llama_kv_cache_can_shift(const llama_context * ctx) { - return llama_kv_self_can_shift(ctx); +bool llama_context_unified::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); + + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; + } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } + + // restore the context state + { + const size_t n_state_size_cur = file.size() - file.tell(); + + llama_data_read_file data_ctx(this, &file); + const size_t n_read = state_set_data(data_ctx); + + if (n_read != n_state_size_cur) { + LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); + return false; + } + } + + return true; } -bool llama_kv_self_can_shift(const llama_context * ctx) { - return llama_kv_cache_can_shift(&ctx->kv_self); +bool llama_context_unified::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); + + file.write_u32(LLAMA_SESSION_MAGIC); + file.write_u32(LLAMA_SESSION_VERSION); + + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state using stream saving + llama_data_write_file data_ctx(this, &file); + state_get_data(data_ctx); + + return true; } -// deprecated -void llama_kv_cache_update(llama_context * ctx) { - llama_kv_self_update(ctx); -} +size_t llama_context_unified::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); + + // version checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); + return 0; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); + return 0; + } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } -// llama state API + // restore the context state + { + const size_t state_size = file.size() - file.tell(); + llama_data_read_file data_ctx(this, &file); + const size_t nread = state_seq_set_data(data_ctx, seq_id); + if (!nread) { + LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); + return 0; + } + GGML_ASSERT(nread <= state_size); + GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); + } -// deprecated -size_t llama_get_state_size(struct llama_context * ctx) { - return llama_state_get_size(ctx); + return file.tell(); } -// deprecated -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { - return llama_state_get_data(ctx, dst, -1); -} +size_t llama_context_unified::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); -// deprecated -size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { - return llama_state_set_data(ctx, src, -1); -} + file.write_u32(LLAMA_STATE_SEQ_MAGIC); + file.write_u32(LLAMA_STATE_SEQ_VERSION); -// deprecated -bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); -} + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); -// deprecated -bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - return llama_state_save_file(ctx, path_session, tokens, n_token_count); -} + // save the context state using stream saving + llama_data_write_file data_ctx(this, &file); + state_seq_get_data(data_ctx, seq_id); -// TODO: replace all non-fatal assertions with returned errors or exceptions -struct llama_data_write { - virtual void write(const void * src, size_t size) = 0; - virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; - virtual size_t get_size_written() = 0; - virtual ~llama_data_write() = default; + const size_t res = file.tell(); + GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written()); - void write_string(const std::string & str) { - uint32_t str_size = str.size(); + return res; +} - write(&str_size, sizeof(str_size)); - write(str.data(), str_size); - } +/** copy state data into either a buffer or file depending on the passed in context + * + * file context: + * llama_file file("/path", "wb"); + * llama_data_write_file data_ctx(&file); + * llama_state_get_data_internal(ctx, data_ctx); + * + * buffer context: + * std::vector buf(max_size, 0); + * llama_data_write_buffer data_ctx(buf.data(), max_size); + * llama_state_get_data_internal(ctx, data_ctx); + * +*/ +size_t llama_context_unified::state_get_data(llama_data_write & data_ctx) { + synchronize(); - void write_model_info(const struct llama_context * ctx) { - const std::string arch_str = llm_arch_name(ctx->model.arch); - write_string(arch_str); - // TODO: add more model-specific info which should prevent loading the session file if not identical - } + 
data_ctx.write_model_info(); - //void write_rng(const std::mt19937 & rng) { - // std::ostringstream rng_ss; - // rng_ss << rng; + // copy outputs + data_ctx.write_output_ids(); + data_ctx.write_logits(); + data_ctx.write_embeddings(); - // const std::string & rng_str = rng_ss.str(); + llama_kv_cache::io io = { + /* .write = */ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read = */ nullptr, + /* .read_to = */ nullptr, + }; - // write_string(rng_str); - //} + kv_self.state_write(io, model.hparams); - void write_output_ids(struct llama_context * ctx) { - ctx->reorder_outputs(); + return data_ctx.get_size_written(); +} - const uint32_t n_outputs = ctx->n_outputs; +size_t llama_context_unified::state_set_data(llama_data_read & data_ctx) { + synchronize(); - std::vector output_pos; + data_ctx.read_model_info(); - const size_t n_batch = ctx->cparams.n_batch; - const auto & output_ids = ctx->output_ids; + // set outputs + data_ctx.read_output_ids(); + data_ctx.read_logits(); + data_ctx.read_embeddings(); - GGML_ASSERT(n_outputs <= ctx->output_size); + llama_kv_cache::io io = { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { + return data_ctx.read(size); + }, + /* .read_to = */ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; - output_pos.resize(n_outputs); + kv_self.state_read(io, model.hparams); - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch; ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - GGML_ASSERT((uint32_t) pos < n_outputs); - output_pos[pos] = i; - } - } + return data_ctx.get_size_read(); +} - write(&n_outputs, sizeof(n_outputs)); +size_t llama_context_unified::state_seq_get_data(llama_data_write & data_ctx, llama_seq_id seq_id) { + synchronize(); - if (n_outputs) { - write(output_pos.data(), n_outputs * sizeof(int32_t)); - } - } + llama_kv_cache::io io = { + /* .write = */ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read = */ nullptr, + /* .read_to = */ nullptr, + }; - void write_logits(const struct llama_context * ctx) { - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); + kv_self.state_write(io, model.hparams, seq_id); - write(&logits_size, sizeof(logits_size)); + return data_ctx.get_size_written(); +} - if (logits_size) { - write(ctx->logits, logits_size * sizeof(float)); - } - } +size_t llama_context_unified::state_seq_set_data(llama_data_read & data_ctx, llama_seq_id seq_id) { + synchronize(); - void write_embeddings(const struct llama_context * ctx) { - const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); + llama_kv_cache::io io = { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { + return data_ctx.read(size); + }, + /* .read_to = */ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; - write(&embeddings_size, sizeof(embeddings_size)); + kv_self.state_read(io, model.hparams, seq_id); - if (embeddings_size) { - write(ctx->embd, 
embeddings_size * sizeof(float)); - } - } -}; + return data_ctx.get_size_read(); +} -struct llama_data_read { - virtual const uint8_t * read(size_t size) = 0; - virtual void read_to(void * dst, size_t size) = 0; - virtual size_t get_size_read() = 0; - virtual ~llama_data_read() = default; +// +// interface implementation +// - void read_string(std::string & str) { - uint32_t str_size; - read_to(&str_size, sizeof(str_size)); +void llama_free(struct llama_context * ctx) { + delete ctx; +} - str.assign((const char *) read(str_size), str_size); - } +uint32_t llama_n_ctx(const struct llama_context * ctx) { + return ctx->n_ctx(); +} - // validate model information - void read_model_info(const struct llama_context * ctx) { - const std::string cur_arch_str = llm_arch_name(ctx->model.arch); +uint32_t llama_n_batch(const struct llama_context * ctx) { + return ctx->n_batch(); +} - std::string arch_str; - read_string(arch_str); - if (cur_arch_str != arch_str) { - throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); - } - // TODO: add more info which needs to be identical but which is not verified otherwise - } +uint32_t llama_n_ubatch(const struct llama_context * ctx) { + return ctx->n_ubatch(); +} - //void read_rng(std::mt19937 & rng) { - // std::string rng_str; - // read_string(rng_str); +uint32_t llama_n_seq_max(const struct llama_context * ctx) { + return ctx->n_seq_max(); +} - // std::istringstream rng_ss(rng_str); - // rng_ss >> rng; +const llama_model * llama_get_model(const llama_context * ctx) { + return &ctx->model; +} - // if (rng_ss.fail()) { - // throw std::runtime_error("failed to load RNG state"); - // } - //} +llama_kv_cache * llama_get_kv_self(llama_context * ctx) { + return ctx->get_kv_self(); +} - void read_output_ids(struct llama_context * ctx) { - std::vector output_pos; +void llama_kv_self_update(llama_context * ctx) { + ctx->kv_self_update(); +} - uint32_t n_outputs; - read_to(&n_outputs, sizeof(n_outputs)); +enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { + return ctx->pooling_type(); +} - if (n_outputs > ctx->reserve_outputs(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } +void llama_attach_threadpool( + struct llama_context * ctx, + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + ctx->threadpool = threadpool; + ctx->threadpool_batch = threadpool_batch ? 
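// when no dedicated batch threadpool is given, fall back to the
// single-token threadpool for batch processing as well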
threadpool_batch : threadpool; +} - if (n_outputs) { - output_pos.resize(n_outputs); - read_to(output_pos.data(), n_outputs * sizeof(int32_t)); +void llama_detach_threadpool(struct llama_context * ctx) { + ctx->threadpool = nullptr; + ctx->threadpool_batch = nullptr; +} - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= ctx->cparams.n_batch) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch)); - } - ctx->output_ids[id] = i; - } +void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { + ctx->cparams.n_threads = n_threads; + ctx->cparams.n_threads_batch = n_threads_batch; +} - ctx->n_outputs = n_outputs; - } - } +int32_t llama_n_threads(struct llama_context * ctx) { + return ctx->cparams.n_threads; +} - void read_logits(struct llama_context * ctx) { - uint64_t logits_size; - read_to(&logits_size, sizeof(logits_size)); +int32_t llama_n_threads_batch(struct llama_context * ctx) { + return ctx->cparams.n_threads_batch; +} - if (ctx->logits_size < logits_size) { - throw std::runtime_error("logits buffer too small"); - } +void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = abort_callback_data; - if (logits_size) { - read_to(ctx->logits, logits_size * sizeof(float)); + for (auto & backend : ctx->backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); } } +} - void read_embeddings(struct llama_context * ctx) { - uint64_t embeddings_size; - read_to(&embeddings_size, sizeof(embeddings_size)); +void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { + ctx->cparams.embeddings = embeddings; +} - if (ctx->embd_size < embeddings_size) { - throw std::runtime_error("embeddings buffer too small"); - } +void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { + ctx->cparams.causal_attn = causal_attn; +} - if (embeddings_size) { - read_to(ctx->embd, embeddings_size * sizeof(float)); - } - } -}; +void llama_synchronize(struct llama_context * ctx) { + ctx->synchronize(); +} -struct llama_data_write_dummy : llama_data_write { - size_t size_written = 0; +float * llama_get_logits(struct llama_context * ctx) { + ctx->synchronize(); - llama_data_write_dummy() {} + return ctx->get_logits(); +} - void write(const void * /* src */, size_t size) override { - size_written += size; - } +float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + ctx->synchronize(); - void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { - size_written += size; - } + return ctx->get_logits_ith(i); +} - size_t get_size_written() override { - return size_written; - } -}; +float * llama_get_embeddings(struct llama_context * ctx) { + ctx->synchronize(); -struct llama_data_write_buffer : llama_data_write { - uint8_t * ptr; - size_t buf_size = 0; - size_t size_written = 0; + return ctx->get_embeddings(); +} - llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {} +float * llama_get_embeddings_ith(struct llama_context 
* ctx, int32_t i) { + ctx->synchronize(); - void write(const void * src, size_t size) override { - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - memcpy(ptr, src, size); - ptr += size; - size_written += size; - buf_size -= size; - } + return ctx->get_embeddings_ith(i); +} - void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - ggml_backend_tensor_get(tensor, ptr, offset, size); - ptr += size; - size_written += size; - buf_size -= size; - } +float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { + ctx->synchronize(); - size_t get_size_written() override { - return size_written; - } -}; + return ctx->get_embeddings_seq(seq_id); +} -struct llama_data_read_buffer : llama_data_read { - const uint8_t * ptr; - size_t buf_size = 0; - size_t size_read = 0; +// llama adapter API - llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} +int32_t llama_set_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter, + float scale) { + ctx->loras[adapter] = scale; + return 0; +} - const uint8_t * read(size_t size) override { - const uint8_t * base_ptr = ptr; - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - ptr += size; - size_read += size; - buf_size -= size; - return base_ptr; +int32_t llama_rm_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter) { + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); + return 0; } - void read_to(void * dst, size_t size) override { - memcpy(dst, read(size), size); - } + return -1; +} - size_t get_size_read() override { - return size_read; - } -}; +void llama_clear_adapter_lora(struct llama_context * ctx) { + ctx->loras.clear(); +} -struct llama_data_write_file : llama_data_write { - llama_file * file; - size_t size_written = 0; - std::vector temp_buffer; +int32_t llama_apply_adapter_cvec( + struct llama_context * ctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); +} - llama_data_write_file(llama_file * f) : file(f) {} +// +// kv cache view +// - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; - } +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { + return llama_kv_cache_view_init(*ctx->get_kv_self(), n_seq_max); +} - void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { - temp_buffer.resize(size); - ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); - write(temp_buffer.data(), temp_buffer.size()); - } +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { + llama_kv_cache_view_update(view, *ctx->get_kv_self()); +} - size_t get_size_written() override { - return size_written; - } -}; +// +// kv cache +// -struct llama_data_read_file : llama_data_read { - llama_file * file; - size_t size_read = 0; - std::vector temp_buffer; +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} - llama_data_read_file(llama_file * f) : file(f) {} +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { + return 
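// the context-level KV helpers simply forward to the llama_kv_cache
// instance owned by the context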
llama_kv_cache_n_tokens(ctx->get_kv_self()); +} - void read_to(void * dst, size_t size) override { - file->read_raw(dst, size); - size_read += size; - } +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} - const uint8_t * read(size_t size) override { - temp_buffer.resize(size); - read_to(temp_buffer.data(), size); - return temp_buffer.data(); - } +int32_t llama_kv_self_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(ctx->get_kv_self()); +} - size_t get_size_read() override { - return size_read; - } -}; +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} -/** copy state data into either a buffer or file depending on the passed in context - * - * file context: - * llama_file file("/path", "wb"); - * llama_data_write_file data_ctx(&file); - * llama_state_get_data_internal(ctx, data_ctx); - * - * buffer context: - * std::vector buf(max_size, 0); - * llama_data_write_buffer data_ctx(buf.data(), max_size); - * llama_state_get_data_internal(ctx, data_ctx); - * -*/ -static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) { - llama_synchronize(ctx); +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(ctx->get_kv_self()); +} - data_ctx.write_model_info(ctx); +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} - // copy outputs - data_ctx.write_output_ids(ctx); - data_ctx.write_logits(ctx); - data_ctx.write_embeddings(ctx); +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(ctx->get_kv_self(), seq_id, p0, p1); +} - llama_kv_cache::io io = { - /* .write = */ [&](const void * src, size_t size) { - data_ctx.write(src, size); - }, - /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { - data_ctx.write_tensor_data(tensor, offset, size); - }, - /* .read = */ nullptr, - /* .read_to = */ nullptr, - }; +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(ctx->get_kv_self(), seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} - ctx->kv_self.state_write(io, ctx->model.hparams); +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(ctx->get_kv_self(), seq_id); +} - return data_ctx.get_size_written(); +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); } -size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { - llama_data_write_buffer data_ctx(dst, size); - try { - return llama_state_get_data_internal(ctx, data_ctx); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); - return 0; - } +void 
llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(ctx->get_kv_self(), seq_id, p0, p1, delta); } -// Returns the *actual* size of the state. -// Intended to be used when saving to state to a buffer. -size_t llama_state_get_size(struct llama_context * ctx) { - llama_data_write_dummy data_ctx; - try { - return llama_state_get_data_internal(ctx, data_ctx); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); - return 0; - } +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); } -static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) { - llama_synchronize(ctx); +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(ctx->get_kv_self(), seq_id, p0, p1, d); +} - data_ctx.read_model_info(ctx); +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} - // set outputs - data_ctx.read_output_ids(ctx); - data_ctx.read_logits(ctx); - data_ctx.read_embeddings(ctx); +llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(ctx->get_kv_self(), seq_id); +} - llama_kv_cache::io io = { - /* .write = */ nullptr, - /* .write_tensor_data = */ nullptr, - /* .read = */ [&](size_t size) { - return data_ctx.read(size); - }, - /* .read_to = */ [&](void * dst, size_t size) { - data_ctx.read_to(dst, size); - }, - }; +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} - ctx->kv_self.state_read(io, ctx->model.hparams); +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(ctx->get_kv_self()); +} - return data_ctx.get_size_read(); +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); } -// Sets the state reading from the specified source address -size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { - llama_data_read_buffer data_ctx(src, size); - try { - return llama_state_set_data_internal(ctx, data_ctx); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); - return 0; - } +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(ctx->get_kv_self()); } -static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(path_session, "rb"); +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} - // sanity checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); +// llama state API - if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } - } +// deprecated +size_t llama_get_state_size(struct llama_context * ctx) { + return llama_state_get_size(ctx); +} - // load the prompt - { - const uint32_t n_token_count = 
file.read_u32(); +// deprecated +size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { + return llama_state_get_data(ctx, dst, -1); +} - if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); - return false; - } +// deprecated +size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { + return llama_state_set_data(ctx, src, -1); +} - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } +// deprecated +bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); +} - // restore the context state - { - const size_t n_state_size_cur = file.size() - file.tell(); +// deprecated +bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + return llama_state_save_file(ctx, path_session, tokens, n_token_count); +} - llama_data_read_file data_ctx(&file); - const size_t n_read = llama_state_set_data_internal(ctx, data_ctx); +// Returns the *actual* size of the state. +// Intended to be used when saving to state to a buffer. +size_t llama_state_get_size(struct llama_context * ctx) { + return ctx->state_get_size(); +} - if (n_read != n_state_size_cur) { - LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); - return false; - } - } - return true; +size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { + return ctx->state_get_data(dst, size); +} + +// Sets the state reading from the specified source address +size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { + return ctx->state_set_data(src, size); } bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { try { - return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); + return ctx->state_load_file(path_session, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what()); return false; } } -static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - llama_file file(path_session, "wb"); - - file.write_u32(LLAMA_SESSION_MAGIC); - file.write_u32(LLAMA_SESSION_VERSION); - - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - // save the context state using stream saving - llama_data_write_file data_ctx(&file); - llama_state_get_data_internal(ctx, data_ctx); - - return true; -} - bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { try { - return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count); + return ctx->state_save_file(path_session, tokens, n_token_count); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what()); return false; } } -static size_t 
llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { - llama_synchronize(ctx); - - llama_kv_cache::io io = { - /* .write = */ [&](const void * src, size_t size) { - data_ctx.write(src, size); - }, - /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { - data_ctx.write_tensor_data(tensor, offset, size); - }, - /* .read = */ nullptr, - /* .read_to = */ nullptr, - }; - - ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); - - return data_ctx.get_size_written(); -} - size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) { - llama_data_write_dummy data_ctx; - return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); + return ctx->state_seq_get_size(seq_id); } size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { - llama_data_write_buffer data_ctx(dst, size); - try { - return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what()); - return 0; - } -} - -static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { - llama_synchronize(ctx); - - llama_kv_cache::io io = { - /* .write = */ nullptr, - /* .write_tensor_data = */ nullptr, - /* .read = */ [&](size_t size) { - return data_ctx.read(size); - }, - /* .read_to = */ [&](void * dst, size_t size) { - data_ctx.read_to(dst, size); - }, - }; - - ctx->kv_self.state_read(io, ctx->model.hparams, dest_seq_id); - - return data_ctx.get_size_read(); -} - -size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) { - llama_data_read_buffer data_ctx(src, size); - try { - return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what()); - return 0; - } -} - -static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); - - file.write_u32(LLAMA_STATE_SEQ_MAGIC); - file.write_u32(LLAMA_STATE_SEQ_VERSION); - - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - // save the context state using stream saving - llama_data_write_file data_ctx(&file); - llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); - - const size_t res = file.tell(); - GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written()); - return res; + return ctx->state_seq_get_data(seq_id, dst, size); } -static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); - - // version checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); - - if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { - LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); - return 0; - } - } - - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); - - if 
(n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); - return 0; - } - - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } - - // restore the context state - { - const size_t state_size = file.size() - file.tell(); - llama_data_read_file data_ctx(&file); - const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); - if (!nread) { - LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); - return 0; - } - GGML_ASSERT(nread <= state_size); - GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); - } - - return file.tell(); +size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) { + return ctx->state_seq_set_data(seq_id, src, size); } size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { try { - return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count); + return ctx->state_seq_save_file(seq_id, filepath, tokens, n_token_count); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what()); return 0; @@ -3752,7 +3973,7 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { try { - return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out); + return ctx->state_seq_load_file(dest_seq_id, filepath, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what()); return 0; diff --git a/src/llama-context.h b/src/llama-context.h index 8f22fd3b1d3a1..f7e007f3273c5 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,38 +16,245 @@ using llama_loras = std::unordered_map; -struct llama_batch_manager_i; - -// TODO: make implementation details private -// TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - // TODO: tmp until llama-model starts implementing the graph build function - typedef std::function build_graph_callback; + llama_context(const llama_model & model); + virtual ~llama_context(); - llama_context( - const llama_model & model, - const llama_context_params & params, - build_graph_callback && cb_build_graph); + virtual void synchronize(); + + virtual uint32_t n_ctx() const = 0; + virtual uint32_t n_batch() const = 0; + virtual uint32_t n_ubatch() const = 0; + virtual uint32_t n_seq_max() const = 0; + + virtual llama_kv_cache * get_kv_self() = 0; + virtual const llama_kv_cache * get_kv_self() const = 0; + + virtual void kv_self_update() = 0; + + virtual enum llama_pooling_type pooling_type() const = 0; + + virtual float * get_logits() = 0; + virtual float * get_logits_ith(int32_t i) = 0; + + virtual float * get_embeddings() = 0; + virtual float * get_embeddings_ith(int32_t i) = 0; + virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; + + int64_t n_pos_per_token() const; // vision + + virtual ggml_context_ptr init(); + + 
virtual int decode(llama_batch & inp_batch) = 0; + virtual int encode(llama_batch & inp_batch) = 0; + + // graph build API (generic) + + // do mat_mul, while optionally apply lora + virtual ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + // do mat_mul_id, while optionally apply lora + virtual ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + + // graph build API (context-specific) + + virtual ggml_tensor * build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) = 0; + + virtual ggml_tensor * build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual ggml_tensor * build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual ggml_tensor * build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) = 0; + + virtual void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) = 0; + + virtual ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) = 0; + + virtual ggml_tensor * get_rope_factors(int il) = 0; + + virtual void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) = 0; + + virtual ggml_tensor * build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_KQ_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) = 0; + + virtual ggml_tensor * build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) = 0; + + virtual ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; - virtual ~llama_context() = default; + virtual ggml_tensor * build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; - const struct llama_model & model; + virtual ggml_tensor * build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + 
ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + // state save/load + + virtual size_t state_get_size() = 0; + virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; + virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; + + virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; + virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; + virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; + + virtual bool state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) = 0; + + virtual bool state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) = 0; + + virtual size_t state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) = 0; + + virtual size_t state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) = 0; + + // members + + const llama_model & model; llama_cparams cparams; - llama_sbatch sbatch; // TODO: revisit if needed llama_adapter_cvec cvec; llama_loras loras; - build_graph_callback cb_build_graph; + ggml_threadpool_t threadpool = nullptr; + ggml_threadpool_t threadpool_batch = nullptr; + + ggml_abort_callback abort_callback = nullptr; + void * abort_callback_data = nullptr; std::vector backends; std::vector> set_n_threads_fns; ggml_backend_t backend_cpu = nullptr; - ggml_threadpool_t threadpool = nullptr; - ggml_threadpool_t threadpool_batch = nullptr; + ggml_backend_sched_ptr sched; + + // memory buffers used to evaluate the model + std::vector buf_compute_meta; + // perf bool has_evaluated_once = false; mutable int64_t t_start_us; @@ -60,6 +267,49 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls +}; + +// TODO: make implementation details private +struct llama_context_unified : public llama_context { + struct batch_manager; + + // TODO: tmp until llama-model starts implementing the graph build function + typedef std::function build_graph_callback; + + llama_context_unified( + const llama_model & model, + const llama_context_params & params, + build_graph_callback && cb_build_graph); + + virtual ~llama_context_unified(); + + virtual uint32_t n_ctx() const override; + virtual uint32_t n_batch() const override; + virtual uint32_t n_ubatch() const override; + virtual uint32_t n_seq_max() const override; + + virtual llama_kv_cache * get_kv_self() override; + virtual const llama_kv_cache * get_kv_self() const override; + + virtual void kv_self_update() override; + + virtual enum llama_pooling_type pooling_type() const override; + + virtual float * get_logits() override; + virtual float * get_logits_ith(int32_t i) override; + + virtual float * get_embeddings() override; + virtual float * get_embeddings_ith(int32_t i) override; + virtual float * get_embeddings_seq(llama_seq_id seq_id) override; + + virtual ggml_context_ptr init() override; + + virtual int decode(llama_batch & inp_batch) override; + virtual int encode(llama_batch & inp_batch) override; + + llama_sbatch sbatch; + + build_graph_callback cb_build_graph; // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; @@ -72,7 
+322,7 @@ struct llama_context { size_t output_size = 0; // capacity (of tokens positions) for the output buffers int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch - bool logits_all = false; + bool logits_all = false; bool need_reserve = false; // embeddings output (2-dimensional array: [n_outputs][n_embd]) @@ -84,17 +334,7 @@ struct llama_context { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - // memory buffers used to evaluate the model - std::vector buf_compute_meta; - ggml_backend_sched_ptr sched; - - ggml_abort_callback abort_callback = nullptr; - void * abort_callback_data = nullptr; - - virtual std::unique_ptr prepare_batch(const llama_batch & batch); - - virtual int decode(llama_batch & inp_batch); - virtual int encode(llama_batch & inp_batch); + virtual std::unique_ptr prepare_batch(const llama_batch & batch); // returns the result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( @@ -107,32 +347,19 @@ struct llama_context { // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - void reset(); - void prepare_k_shift(); void prepare_defrag(); void set_inputs(const llama_ubatch & ubatch); // make the outputs have the same order they had in the user-provided batch - // TODO: maybe deprecate this + // TODO: maybe remove this void reorder_outputs(); // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. size_t reserve_outputs(size_t n_outputs); - ggml_tensor * build_lora_mm( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur); - - ggml_tensor * build_lora_mm_id( - ggml_context * ctx0, - ggml_tensor * w, // struct ggml_tensor * as - ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids); - // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] @@ -141,49 +368,55 @@ struct llama_context { struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] - // === encoder-decoder === - - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - // === unified KV cache === - llama_kv_cache kv_self; + llama_kv_cache kv_self; struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] struct ggml_tensor * inp_K_shift; // I32 [kv_size] - // return true if need to reserve new worst-case graph - void kv_self_update(); + virtual ggml_tensor * build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) override; + + virtual ggml_tensor * build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) override; - void build_attn_inp( + virtual ggml_tensor * build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) override; + + virtual ggml_tensor * build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) override; + 
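
For orientation, each of these overrides is expected to take over the input-tensor setup that later hunks in this series delete from llm_build_context (a ggml_new_tensor_* call plus ggml_set_input on the corresponding member tensor). A minimal sketch of one such override, assuming the implementation in llama-context.cpp mirrors the code removed below and uses the inp_pos member declared above (hypothetical body, for illustration only):

    // Sketch: mirrors the inp_pos setup that llm_build_context::build_inp_pos used to do inline.
    ggml_tensor * llama_context_unified::build_inp_pos(ggml_context * ctx0, int32_t n_tokens) {
        // allocate the I32 position tensor in the graph context and mark it as a graph input
        inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
        ggml_set_input(inp_pos);

        return inp_pos;
    }

The cb() naming callback is deliberately not part of the sketch: as the llama.cpp hunks below show, the llm_build_context wrappers keep calling cb(cur, "inp_pos", -1) after delegating to the context.
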
+ virtual ggml_tensor * build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) override; + + virtual void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa, - bool worst_case); + bool worst_case) override; - void build_attn_kv_store( + virtual void build_attn_kv_store( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, int64_t il, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_attn_qkv( + virtual ggml_tensor * build_attn_qkv( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * wo, @@ -192,39 +425,65 @@ struct llama_context { int32_t n_tokens, float kq_scale, int il, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_soft_max_ext( + virtual ggml_tensor * build_soft_max_ext( ggml_context * ctx0, ggml_tensor * kq, - float kq_scale); + float kq_scale) override; - ggml_tensor * get_rope_factors(int il); + virtual ggml_tensor * get_rope_factors(int il) override; - void build_k_shift( + virtual void build_k_shift( ggml_context * ctx0, - ggml_cgraph * graph); + ggml_cgraph * graph) override; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - void build_defrag( + virtual void build_defrag( ggml_context * ctx0, - ggml_cgraph * graph); + ggml_cgraph * graph) override; + + // === encoder-decoder === + + // whether we are computing encoder output or decoder output + bool is_encoding = false; + + // output of the encoder part of the encoder-decoder models + std::vector embd_enc; + std::vector> seq_ids_enc; + + struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] + struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] + struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + + virtual ggml_tensor * build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) override; + + virtual ggml_tensor * build_inp_KQ_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) override; // === recurrent === + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + // TODO: add recurrent cache // TODO: add mamba-specific llama_context // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl - ggml_tensor * build_inp_s_copy( + virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_inp_s_mask( + virtual ggml_tensor * build_inp_s_mask( ggml_context * ctx0, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_copy_mask_state( + virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * s, @@ -233,9 +492,9 @@ struct llama_context { int32_t n_tokens, int32_t n_state, int32_t n_seqs, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_mamba_layer( + virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -243,25 +502,25 @@ struct llama_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_rwkv_token_shift_load( + virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case); + bool worst_case) override; - ggml_tensor * 
build_rwkv_token_shift_store( + virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, int il, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_rwkv6_time_mix( + virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -270,17 +529,48 @@ struct llama_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case); - - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - - // === vision === - - // TODO: find a better way to accommodate mutli-dimension position encoding methods - // number of position id each token get, 1 for each token in most cases. - // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. - int n_pos_per_token = 1; + bool worst_case) override; + + // state save/load + + virtual size_t state_get_size() override; + virtual size_t state_get_data( uint8_t * dst, size_t size) override; + virtual size_t state_set_data(const uint8_t * src, size_t size) override; + + virtual size_t state_seq_get_size(llama_seq_id seq_id) override; + virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; + virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; + + virtual bool state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) override; + + virtual bool state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) override; + + virtual size_t state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) override; + + virtual size_t state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) override; + +private: + size_t state_get_data(struct llama_data_write & data_ctx); + size_t state_set_data(struct llama_data_read & data_ctx); + + size_t state_seq_get_data(struct llama_data_write & data_ctx, llama_seq_id seq_id); + size_t state_seq_set_data(struct llama_data_read & data_ctx, llama_seq_id seq_id); }; // For internal test use diff --git a/src/llama.cpp b/src/llama.cpp index ed5e1e5254e7a..7c002f9bf8ff0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8,7 +8,6 @@ #include "llama-model.h" #include "ggml.h" -#include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml-cpp.h" @@ -86,8 +85,6 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_outputs; - const int32_t n_outputs_enc; const int32_t n_ctx_orig; const bool worst_case; @@ -98,9 +95,8 @@ struct llm_build_context { const llm_build_cb & cb; - std::vector & buf_compute_meta; - - struct ggml_context * ctx0 = nullptr; + const ggml_context_ptr ctx = nullptr; + ggml_context * ctx0 = nullptr; // TODO: consider making the entire interface noexcept llm_build_context( @@ -136,132 +132,37 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - n_outputs (worst_case ? n_tokens : lctx.n_outputs), - n_outputs_enc (worst_case ? 
n_tokens : lctx.embd_enc.size() / hparams.n_embd), n_ctx_orig (cparams.n_ctx_orig_yarn), worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), cb (cb), - buf_compute_meta (lctx.buf_compute_meta) { - // all initializations should be done in init() + ctx (lctx.init()), + ctx0 (ctx.get()) { } - void init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ctx0 = ggml_init(params); - - lctx.reset(); - } - - void free() { - ggml_free(ctx0); - ctx0 = nullptr; - } - + // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL; - - if (ubatch.token) { - lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - cb(lctx.inp_tokens, "inp_tokens", -1); - ggml_set_input(lctx.inp_tokens); - - inpL = ggml_get_rows(ctx0, tok_embd, lctx.inp_tokens); - - // apply lora for embedding tokens if needed - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( - ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, lctx.inp_tokens) - ), scale); - - inpL = ggml_add(ctx0, inpL, inpL_delta); - } - } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = lctx.inp_embd; - ggml_set_input(lctx.inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); - } - + struct ggml_tensor * inpL = lctx.build_inp_embd(ctx0, tok_embd, ubatch); cb(inpL, "inp_embd", -1); return inpL; } - // do mat_mul, while optionally apply lora + // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, struct ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - - return res; + return lctx.build_lora_mm(ctx0, w, cur); } - // do mat_mul_id, while optionally apply lora + // TODO: tmp struct ggml_tensor * build_lora_mm_id( struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b struct ggml_tensor * ids) { - struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } - - const float alpha = lora.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? 
lora.second * alpha / rank : lora.second; - - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - - return res; + return lctx.build_lora_mm_id(ctx0, w, cur, ids); } struct ggml_tensor * build_norm( @@ -620,31 +521,31 @@ struct llm_build_context { } struct ggml_tensor * build_inp_pos() { - lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(lctx.inp_pos, "inp_pos", -1); - ggml_set_input(lctx.inp_pos); - return lctx.inp_pos; + ggml_tensor * cur = lctx.build_inp_pos(ctx0, n_tokens); + cb(cur, "inp_pos", -1); + + return cur; } struct ggml_tensor * build_inp_out_ids() { - lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); - cb(lctx.inp_out_ids, "inp_out_ids", -1); - ggml_set_input(lctx.inp_out_ids); - return lctx.inp_out_ids; + ggml_tensor * cur = lctx.build_inp_out_ids(ctx0, n_tokens, worst_case); + cb(cur, "inp_out_ids", -1); + + return cur; } struct ggml_tensor * build_inp_mean() { - lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - cb(lctx.inp_mean, "inp_mean", -1); - ggml_set_input(lctx.inp_mean); - return lctx.inp_mean; + ggml_tensor * cur = lctx.build_inp_mean(ctx0, n_tokens); + cb(cur, "inp_mean", -1); + + return cur; } struct ggml_tensor * build_inp_cls() { - lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(lctx.inp_cls, "inp_cls", -1); - ggml_set_input(lctx.inp_cls); - return lctx.inp_cls; + ggml_tensor * cur = lctx.build_inp_cls(ctx0, n_tokens); + cb(cur, "inp_cls", -1); + + return cur; } struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { @@ -745,26 +646,22 @@ struct llm_build_context { //} struct ggml_tensor * build_inp_embd_enc() { - const int64_t n_embd = hparams.n_embd; - lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); - ggml_set_input(lctx.inp_embd_enc); - cb(lctx.inp_embd_enc, "embd_enc", -1); - return lctx.inp_embd_enc; + ggml_tensor * cur = lctx.build_inp_embd_enc(ctx0, n_tokens, worst_case); + cb(cur, "embd_enc", -1); + + return cur; } struct ggml_tensor * build_inp_KQ_mask_cross() { - lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - ggml_set_input(lctx.inp_KQ_mask_cross); - cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1); - return lctx.inp_KQ_mask_cross; + ggml_tensor * cur = lctx.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + cb(cur, "KQ_mask_cross", -1); + + return cur; } struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -838,7 +735,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -927,9 +823,6 @@ struct llm_build_context { struct ggml_cgraph * build_deci() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t 
n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -1014,7 +907,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -1422,9 +1314,6 @@ struct llm_build_context { struct ggml_cgraph * build_grok() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -1498,7 +1387,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -1580,9 +1468,6 @@ struct llm_build_context { struct ggml_cgraph * build_dbrx() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -1649,7 +1534,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -2716,10 +2600,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); - cb(lctx.inp_pos, "inp_pos", -1); - ggml_set_input(lctx.inp_pos); - struct ggml_tensor * inp_pos = lctx.inp_pos; + struct ggml_tensor * inp_pos = build_inp_pos(); lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); @@ -2825,9 +2706,6 @@ struct llm_build_context { struct ggml_cgraph * build_qwen2moe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2891,7 +2769,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -4685,9 +4562,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmo() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ 
-4757,7 +4631,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -4808,9 +4681,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmo2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4880,7 +4750,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -4935,9 +4804,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmoe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5006,7 +4872,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5325,9 +5190,6 @@ struct llm_build_context { struct ggml_cgraph * build_arctic() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5385,7 +5247,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5458,9 +5319,6 @@ struct llm_build_context { struct ggml_cgraph * build_deepseek() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5535,7 +5393,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5616,9 +5473,6 @@ struct llm_build_context { struct ggml_cgraph * build_deepseek2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - bool 
is_lite = (hparams.n_layer == 27); // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. @@ -5767,7 +5621,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5996,9 +5849,6 @@ struct llm_build_context { //struct ggml_cgraph * build_t5_enc() { // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // // mutable variable, needed during the last layer of the computation to skip unused tokens - // int32_t n_tokens = this->n_tokens; - // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6072,7 +5922,6 @@ struct llm_build_context { // if (il == n_layer - 1) { // // skip computing output for unused tokens // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // n_tokens = n_outputs; // cur = ggml_get_rows(ctx0, cur, inp_out_ids); // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // } @@ -6128,9 +5977,6 @@ struct llm_build_context { //struct ggml_cgraph * build_t5_dec() { // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // // mutable variable, needed during the last layer of the computation to skip unused tokens - // int32_t n_tokens = this->n_tokens; - // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6272,7 +6118,6 @@ struct llm_build_context { // if (il == n_layer - 1) { // // skip computing output for unused tokens // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // n_tokens = n_outputs; // cur = ggml_get_rows(ctx0, cur, inp_out_ids); // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); @@ -6673,9 +6518,6 @@ struct llm_build_context { struct ggml_cgraph * build_exaone() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6748,7 +6590,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6978,9 +6819,6 @@ struct llm_build_context { struct ggml_cgraph * build_chameleon() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7076,7 +6914,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -7341,8 +7178,6 @@ static struct 
ggml_cgraph * llama_build_graph( struct llm_build_context llm(lctx, ubatch, cb, worst_case); - llm.init(); - switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_MINICPM: @@ -7403,7 +7238,6 @@ static struct ggml_cgraph * llama_build_graph( } break; case LLM_ARCH_QWEN2VL: { - lctx.n_pos_per_token = 4; result = llm.build_qwen2vl(); } break; case LLM_ARCH_QWEN2MOE: @@ -7564,8 +7398,6 @@ static struct ggml_cgraph * llama_build_graph( result = llm.append_pooling(result); } - llm.free(); - return result; } @@ -7908,7 +7740,7 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct - ctx = new llama_context(*model, params, + ctx = new llama_context_unified(*model, params, [](llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { return llama_build_graph(lctx, ubatch, worst_case); }); From b52b79b048e3b82ea68c20de34ceac3fc3984786 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 11:23:38 +0200 Subject: [PATCH 30/84] context : move encode/decode to llama-context.cpp --- src/llama-context.cpp | 25 +++++++++++++++++++ src/llama-context.h | 23 +++++++++++++++++ src/llama.cpp | 57 ------------------------------------------- 3 files changed, 48 insertions(+), 57 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 13beb097cbadd..4e02f155b1a81 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3980,6 +3980,31 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa } } +/// + +int32_t llama_encode( + struct llama_context * ctx, + struct llama_batch batch) { + const int ret = ctx->encode(batch); + if (ret != 0) { + LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret); + } + + return ret; +} + +int32_t llama_decode( + struct llama_context * ctx, + struct llama_batch batch) { + const int ret = ctx->decode(batch); + if (ret != 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); + } + + return ret; +} + + const std::vector> & llama_internal_get_tensor_map( struct llama_context * ctx ) { diff --git a/src/llama-context.h b/src/llama-context.h index f7e007f3273c5..ac842dc8bc54c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -45,7 +45,30 @@ struct llama_context { virtual ggml_context_ptr init(); + // decode a batch of tokens by evaluating the transformer + // in case of unsuccessful decoding (error or warning), + // the kv_cache state will be returned to its original state + // (for non-recurrent models) or cleaned (for recurrent models) + // + // - lctx: llama context + // - inp_batch: batch to evaluate + // + // return 0 on success + // return positive int on warning + // return negative int on error + // virtual int decode(llama_batch & inp_batch) = 0; + + + // encode a batch of tokens by evaluating the encoder part of the transformer + // + // - lctx: llama context + // - batch: batch to evaluate + // + // return 0 on success + // return positive int on warning + // return negative int on error + // virtual int encode(llama_batch & inp_batch) = 0; // graph build API (generic) diff --git a/src/llama.cpp b/src/llama.cpp index 7c002f9bf8ff0..f623dd385d917 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7401,39 +7401,6 @@ static struct ggml_cgraph * llama_build_graph( return result; } -// decode a batch of tokens by evaluating the transformer -// in case of unsuccessful decoding (error or warning), -// the kv_cache state will be returned to its original state -// (for non-recurrent models) 
or cleaned (for recurrent models) -// -// - lctx: llama context -// - inp_batch: batch to evaluate -// -// return 0 on success -// return positive int on warning -// return negative int on error -// -static int llama_decode_impl( - llama_context & lctx, - llama_batch inp_batch) { - return lctx.decode(inp_batch); -} - -// encode a batch of tokens by evaluating the encoder part of the transformer -// -// - lctx: llama context -// - batch: batch to evaluate -// -// return 0 on success -// return positive int on warning -// return negative int on error -// -static int llama_encode_impl( - llama_context & lctx, - llama_batch inp_batch) { - return lctx.encode(inp_batch); -} - // // interface implementation // @@ -7759,30 +7726,6 @@ struct llama_context * llama_new_context_with_model( return llama_init_from_model(model, params); } -/// - -int32_t llama_encode( - struct llama_context * ctx, - struct llama_batch batch) { - const int ret = llama_encode_impl(*ctx, batch); - if (ret != 0) { - LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret); - } - - return ret; -} - -int32_t llama_decode( - struct llama_context * ctx, - struct llama_batch batch) { - const int ret = llama_decode_impl(*ctx, batch); - if (ret != 0) { - LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); - } - - return ret; -} - // // chat templates // From 8da7f612b750851d7e13e4f1697ed8a98c46db3c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 12:11:30 +0200 Subject: [PATCH 31/84] context : improve llama_context encapsulation ggml-ci --- src/llama-adapter.cpp | 10 +- src/llama-adapter.h | 2 +- src/llama-context.cpp | 242 ++++++++++++++++++++++++++++++------------ src/llama-context.h | 79 ++++++++++---- src/llama.cpp | 151 +++++++++++++++----------- 5 files changed, 327 insertions(+), 157 deletions(-) diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 8a0800463137e..3ce36886c0e1f 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -91,7 +91,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { return true; } -int32_t llama_adapter_cvec::apply( +bool llama_adapter_cvec::apply( const llama_model & model, const float * data, size_t len, @@ -104,17 +104,17 @@ int32_t llama_adapter_cvec::apply( // disable the current control vector (but leave allocated for later) layer_start = -1; layer_end = -1; - return 0; + return true; } if (n_embd != (int) hparams.n_embd) { LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); - return 1; + return false; } if (tensors.empty()) { if (!init(model)) { - return 1; + return false; } } @@ -130,7 +130,7 @@ int32_t llama_adapter_cvec::apply( } } - return 0; + return true; } // lora diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 603fa08f6d186..4332ccd57f14b 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -19,7 +19,7 @@ struct llama_adapter_cvec { struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const; - int32_t apply( + bool apply( const llama_model & model, const float * data, size_t len, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4e02f155b1a81..353fc7feac66c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,7 +33,9 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +// // llama_context +// llama_context::llama_context(const llama_model & model) : model (model), @@ -43,6 +45,52 @@ llama_context::llama_context(const llama_model & 
model) : llama_context::~llama_context() = default; +const llama_model & llama_context::get_model() const { + return model; +} + +const llama_cparams & llama_context::get_cparams() const { + return cparams; +} + +uint32_t llama_context::n_ctx() const { + return cparams.n_ctx; +} + +uint32_t llama_context::n_batch() const { + return cparams.n_batch; +} + +uint32_t llama_context::n_ubatch() const { + return cparams.n_ubatch; +} + +uint32_t llama_context::n_threads() const { + return cparams.n_threads; +} + +uint32_t llama_context::n_threads_batch() const { + return cparams.n_threads_batch; +} + +enum llama_pooling_type llama_context::pooling_type() const { + return cparams.pooling_type; +} + +int64_t llama_context::n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; +} + +ggml_context_ptr llama_context::init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + return ggml_context_ptr { ggml_init(params) }; +} + void llama_context::synchronize() { ggml_backend_sched_synchronize(sched.get()); @@ -73,21 +121,96 @@ void llama_context::synchronize() { t_compute_start_us = 0; } -int64_t llama_context::n_pos_per_token() const { - return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; +void llama_context::attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + this->threadpool = threadpool; + this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool; } -ggml_context_ptr llama_context::init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; +void llama_context::detach_threadpool() { + this->threadpool = nullptr; + this->threadpool_batch = nullptr; +} - return ggml_context_ptr { ggml_init(params) }; +void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { + cparams.n_threads = n_threads; + cparams.n_threads_batch = n_threads_batch; +} + +void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { + this->abort_callback = abort_callback; + this->abort_callback_data = abort_callback_data; + + for (auto & backend : backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data); + } + } +} + +void llama_context::set_embeddings(bool value) { + cparams.embeddings = value; +} + +void llama_context::set_causal_attn(bool value) { + cparams.causal_attn = value; +} + +void llama_context::set_adapter_lora( + struct llama_adapter_lora * adapter, + float scale) { + loras[adapter] = scale; +} + +bool llama_context::rm_adapter_lora( + struct llama_adapter_lora * adapter) { + auto pos = loras.find(adapter); + if (pos != loras.end()) { + loras.erase(pos); + return true; + } + + return false; +} + +void llama_context::clear_adapter_lora() { + loras.clear(); +} + +bool llama_context::apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return cvec.apply(model, data, len, n_embd, il_start, il_end); +} + +llama_perf_context_data llama_context::get_perf() const { + llama_perf_context_data data = {}; + + data.t_start_ms = 
1e-3 * t_start_us; + data.t_load_ms = 1e-3 * t_load_us; + data.t_p_eval_ms = 1e-3 * t_p_eval_us; + data.t_eval_ms = 1e-3 * t_eval_us; + data.n_p_eval = std::max(1, n_p_eval); + data.n_eval = std::max(1, n_eval); + + return data; } +void llama_context::perf_reset() { + t_start_us = ggml_time_us(); + t_eval_us = n_eval = 0; + t_p_eval_us = n_p_eval = 0; +} + +// // llama_context_unified +// llama_context_unified::llama_context_unified( const llama_model & model, @@ -396,18 +519,6 @@ llama_context_unified::llama_context_unified( llama_context_unified::~llama_context_unified() = default; -uint32_t llama_context_unified::n_ctx() const { - return cparams.n_ctx; -} - -uint32_t llama_context_unified::n_batch() const { - return cparams.n_batch; -} - -uint32_t llama_context_unified::n_ubatch() const { - return cparams.n_ubatch; -} - uint32_t llama_context_unified::n_seq_max() const { // TODO: add notion of n_seq_max to llama_kv_cache and use it here return kv_self.size; @@ -421,10 +532,6 @@ const llama_kv_cache * llama_context_unified::get_kv_self() const { return &kv_self; } -enum llama_pooling_type llama_context_unified::pooling_type() const { - return cparams.pooling_type; -} - float * llama_context_unified::get_logits() { // reorder logits for backward compatibility reorder_outputs(); @@ -1718,7 +1825,13 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) { return n_outputs_max; } -// do mat_mul, while optionally apply lora +ggml_tensor * llama_context::build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il) { + return cvec.apply_to(ctx0, cur, il); +} + ggml_tensor * llama_context::build_lora_mm( ggml_context * ctx0, ggml_tensor * w, @@ -1746,7 +1859,6 @@ ggml_tensor * llama_context::build_lora_mm( return res; } -// do mat_mul_id, while optionally apply lora ggml_tensor * llama_context::build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, @@ -2994,7 +3106,8 @@ struct llama_data_write { } void write_model_info() { - const std::string arch_str = llm_arch_name(ctx->model.arch); + const auto & model = ctx->get_model(); + const std::string arch_str = llm_arch_name(model.arch); write_string(arch_str); // TODO: add more model-specific info which should prevent loading the session file if not identical } @@ -3015,7 +3128,7 @@ struct llama_data_write { std::vector output_pos; - const size_t n_batch = ctx->cparams.n_batch; + const size_t n_batch = ctx->n_batch(); const auto & output_ids = ctx->output_ids; GGML_ASSERT(n_outputs <= ctx->output_size); @@ -3040,7 +3153,9 @@ struct llama_data_write { } void write_logits() { - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); + const auto & model = ctx->get_model(); + + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * model.vocab.n_tokens()); write(&logits_size, sizeof(logits_size)); @@ -3050,7 +3165,9 @@ struct llama_data_write { } void write_embeddings() { - const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); + const auto & model = ctx->get_model(); + + const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * model.hparams.n_embd); write(&embeddings_size, sizeof(embeddings_size)); @@ -3079,7 +3196,9 @@ struct llama_data_read { // validate model information void read_model_info() { - const std::string cur_arch_str = llm_arch_name(ctx->model.arch); + const auto & model = ctx->get_model(); + + const 
std::string cur_arch_str = llm_arch_name(model.arch); std::string arch_str; read_string(arch_str); @@ -3117,8 +3236,8 @@ struct llama_data_read { for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { int32_t id = output_pos[i]; - if ((uint32_t) id >= ctx->cparams.n_batch) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch)); + if ((uint32_t) id >= ctx->n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->n_batch())); } ctx->output_ids[id] = i; } @@ -3598,7 +3717,7 @@ uint32_t llama_n_seq_max(const struct llama_context * ctx) { } const llama_model * llama_get_model(const llama_context * ctx) { - return &ctx->model; + return &ctx->get_model(); } llama_kv_cache * llama_get_kv_self(llama_context * ctx) { @@ -3614,50 +3733,38 @@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { } void llama_attach_threadpool( - struct llama_context * ctx, - ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch) { - ctx->threadpool = threadpool; - ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool; + struct llama_context * ctx, + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + ctx->attach_threadpool(threadpool, threadpool_batch); } void llama_detach_threadpool(struct llama_context * ctx) { - ctx->threadpool = nullptr; - ctx->threadpool_batch = nullptr; + ctx->detach_threadpool(); } void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { - ctx->cparams.n_threads = n_threads; - ctx->cparams.n_threads_batch = n_threads_batch; + ctx->set_n_threads(n_threads, n_threads_batch); } int32_t llama_n_threads(struct llama_context * ctx) { - return ctx->cparams.n_threads; + return ctx->n_threads(); } int32_t llama_n_threads_batch(struct llama_context * ctx) { - return ctx->cparams.n_threads_batch; + return ctx->n_threads_batch(); } void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = abort_callback_data; - - for (auto & backend : ctx->backends) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); - auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); - if (set_abort_callback_fn) { - set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); - } - } + ctx->set_abort_callback(abort_callback, abort_callback_data); } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { - ctx->cparams.embeddings = embeddings; + ctx->set_embeddings(embeddings); } void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { - ctx->cparams.causal_attn = causal_attn; + ctx->set_causal_attn(causal_attn); } void llama_synchronize(struct llama_context * ctx) { @@ -3700,24 +3807,21 @@ int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, float scale) { - ctx->loras[adapter] = scale; + ctx->set_adapter_lora(adapter, scale); + return 0; } int32_t llama_rm_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; - } + bool res = ctx->rm_adapter_lora(adapter); - return -1; + return res ? 
0 : -1; } void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); + ctx->clear_adapter_lora(); } int32_t llama_apply_adapter_cvec( @@ -3727,7 +3831,9 @@ int32_t llama_apply_adapter_cvec( int32_t n_embd, int32_t il_start, int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); + bool res = ctx->apply_adapter_cvec(data, len, n_embd, il_start, il_end); + + return res ? 0 : -1; } // @@ -4008,5 +4114,5 @@ int32_t llama_decode( const std::vector> & llama_internal_get_tensor_map( struct llama_context * ctx ) { - return ctx->model.tensors_by_name; + return ctx->get_model().tensors_by_name; } diff --git a/src/llama-context.h b/src/llama-context.h index ac842dc8bc54c..7b7699952a6e2 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -20,19 +20,23 @@ struct llama_context { llama_context(const llama_model & model); virtual ~llama_context(); - virtual void synchronize(); + const llama_model & get_model() const; + const llama_cparams & get_cparams() const; - virtual uint32_t n_ctx() const = 0; - virtual uint32_t n_batch() const = 0; - virtual uint32_t n_ubatch() const = 0; + virtual uint32_t n_ctx() const; + virtual uint32_t n_batch() const; + virtual uint32_t n_ubatch() const; virtual uint32_t n_seq_max() const = 0; + virtual uint32_t n_threads() const; + virtual uint32_t n_threads_batch() const; + virtual llama_kv_cache * get_kv_self() = 0; virtual const llama_kv_cache * get_kv_self() const = 0; virtual void kv_self_update() = 0; - virtual enum llama_pooling_type pooling_type() const = 0; + virtual enum llama_pooling_type pooling_type() const; virtual float * get_logits() = 0; virtual float * get_logits_ith(int32_t i) = 0; @@ -41,10 +45,41 @@ struct llama_context { virtual float * get_embeddings_ith(int32_t i) = 0; virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; - int64_t n_pos_per_token() const; // vision + virtual int64_t n_pos_per_token() const; // vision virtual ggml_context_ptr init(); + virtual void synchronize(); + + virtual void attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch); + + virtual void detach_threadpool(); + + virtual void set_n_threads(int32_t n_threads, int32_t n_threads_batch); + + virtual void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); + + virtual void set_embeddings (bool value); + virtual void set_causal_attn(bool value); + + virtual void set_adapter_lora( + struct llama_adapter_lora * adapter, + float scale); + + virtual bool rm_adapter_lora( + struct llama_adapter_lora * adapter); + + virtual void clear_adapter_lora(); + + virtual bool apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); + // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -73,6 +108,12 @@ struct llama_context { // graph build API (generic) + // apply control vector for layer il + virtual ggml_tensor * build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il); + // do mat_mul, while optionally apply lora virtual ggml_tensor * build_lora_mm( ggml_context * ctx0, @@ -221,11 +262,11 @@ struct llama_context { // state save/load - virtual size_t state_get_size() = 0; + virtual size_t state_get_size() = 0; virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; - virtual size_t 
state_seq_get_size(llama_seq_id seq_id) = 0; + virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; @@ -253,8 +294,19 @@ struct llama_context { const llama_token * tokens, size_t n_token_count) = 0; + // perf + + virtual llama_perf_context_data get_perf() const; + virtual void perf_reset(); + // members + // TODO: temporary public until llama_context implements the graph build function + std::vector backends; + ggml_backend_t backend_cpu = nullptr; + ggml_backend_sched_ptr sched; + +protected: const llama_model & model; llama_cparams cparams; @@ -267,17 +319,11 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; - std::vector backends; std::vector> set_n_threads_fns; - ggml_backend_t backend_cpu = nullptr; - - ggml_backend_sched_ptr sched; - // memory buffers used to evaluate the model std::vector buf_compute_meta; - // perf bool has_evaluated_once = false; mutable int64_t t_start_us; @@ -306,9 +352,6 @@ struct llama_context_unified : public llama_context { virtual ~llama_context_unified(); - virtual uint32_t n_ctx() const override; - virtual uint32_t n_batch() const override; - virtual uint32_t n_ubatch() const override; virtual uint32_t n_seq_max() const override; virtual llama_kv_cache * get_kv_self() override; @@ -316,8 +359,6 @@ struct llama_context_unified : public llama_context { virtual void kv_self_update() override; - virtual enum llama_pooling_type pooling_type() const override; - virtual float * get_logits() override; virtual float * get_logits_ith(int32_t i) override; diff --git a/src/llama.cpp b/src/llama.cpp index f623dd385d917..ab6b7f5d3dae4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -59,8 +59,6 @@ struct llm_build_context { const llama_hparams & hparams; const llama_cparams & cparams; const llama_ubatch & ubatch; - const llama_adapter_cvec & cvec; - const llama_loras & loras; const int64_t n_embd; const int64_t n_layer; @@ -105,12 +103,10 @@ struct llm_build_context { const llm_build_cb & cb, bool worst_case) : lctx (lctx), - model (lctx.model), + model (lctx.get_model()), hparams (model.hparams), - cparams (lctx.cparams), + cparams (lctx.get_cparams()), ubatch (ubatch), - cvec (lctx.cvec), - loras (lctx.loras), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -791,7 +787,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -947,7 +943,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1067,7 +1063,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1171,7 +1168,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1287,7 +1285,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = 
lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1436,7 +1435,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1564,7 +1563,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1670,7 +1669,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1761,7 +1761,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2057,7 +2058,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2194,7 +2196,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2342,7 +2345,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2454,7 +2458,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2565,7 +2570,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2680,7 +2686,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2823,7 +2830,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2944,7 +2952,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3083,7 +3092,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3190,7 +3200,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3296,7 +3307,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3406,7 +3418,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur 
= lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3521,7 +3534,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3638,7 +3652,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3842,7 +3857,8 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3954,7 +3970,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4077,7 +4094,8 @@ struct llm_build_context { cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, sa_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4202,7 +4220,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4256,7 +4275,8 @@ struct llm_build_context { // residual cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4397,7 +4417,8 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4527,7 +4548,8 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4655,7 +4677,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4774,7 +4796,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4899,7 +4921,8 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5024,7 +5047,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5137,7 +5161,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5165,7 +5190,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = 
cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5293,7 +5319,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5446,7 +5472,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5673,7 +5700,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6492,7 +6520,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6614,7 +6642,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6704,7 +6732,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, 0.5F); } - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6787,7 +6815,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6947,7 +6976,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7140,7 +7169,8 @@ static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { - const auto & model = lctx.model; + const auto & model = lctx.get_model(); + const auto & cparams = lctx.get_cparams(); // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { @@ -7150,7 +7180,7 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_name(cur, name); } - if (!lctx.cparams.offload_kqv) { + if (!cparams.offload_kqv) { if (strcmp(name, "kqv_merged_cont") == 0) { // all nodes between the KV store and the attention output are run on the CPU ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu); @@ -7159,10 +7189,10 @@ static struct ggml_cgraph * llama_build_graph( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer; + const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { - const auto & dev_layer = lctx.model.dev_layer(il); + const auto & dev_layer = model.dev_layer(il); for (auto & backend : lctx.backends) { if (ggml_backend_get_device(backend.get()) == dev_layer) { if (ggml_backend_supports_op(backend.get(), cur)) { @@ -7394,7 +7424,7 @@ static struct ggml_cgraph * llama_build_graph( } // add on pooling layer - if (lctx.cparams.embeddings) { + if (cparams.embeddings) { result = llm.append_pooling(result); } @@ -7824,12 +7854,7 @@ struct llama_perf_context_data llama_perf_context(const struct llama_context * c return data; } - data.t_start_ms = 1e-3 * ctx->t_start_us; - data.t_load_ms = 1e-3 * ctx->t_load_us; - data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; - data.t_eval_ms = 1e-3 * ctx->t_eval_us; - data.n_p_eval = std::max(1, ctx->n_p_eval); - data.n_eval = std::max(1, ctx->n_eval); + data = ctx->get_perf(); return data; } @@ -7848,7 +7873,5 @@ void llama_perf_context_print(const struct llama_context * ctx) { } void llama_perf_context_reset(struct llama_context * ctx) { - ctx->t_start_us = ggml_time_us(); - ctx->t_eval_us = ctx->n_eval = 0; - ctx->t_p_eval_us = ctx->n_p_eval = 0; + ctx->perf_reset(); } From d146a14f77eb456d2082f0620e3b310b7bcee0a8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 12:41:36 +0200 Subject: [PATCH 32/84] context : minor naming fix --- src/llama-context.cpp | 2 +- src/llama-context.h | 12 ++++++------ src/llama.cpp | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 353fc7feac66c..f0d8bdaba073a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -189,7 +189,7 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -llama_perf_context_data llama_context::get_perf() const { +llama_perf_context_data llama_context::perf_get_data() const { llama_perf_context_data data = {}; data.t_start_ms = 1e-3 * t_start_us; diff --git a/src/llama-context.h b/src/llama-context.h index 7b7699952a6e2..8ec7d3e2b1f69 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -94,7 +94,6 @@ struct llama_context { // virtual int decode(llama_batch & inp_batch) = 0; - // encode a batch of tokens by evaluating the encoder part of the transformer // // - lctx: llama context @@ -296,7 +295,7 @@ struct llama_context { // perf - virtual llama_perf_context_data get_perf() const; + virtual llama_perf_context_data perf_get_data() const; virtual void perf_reset(); // members @@ -326,20 +325,21 @@ struct llama_context { bool has_evaluated_once = false; - mutable int64_t 
t_start_us; - mutable int64_t t_load_us; + mutable int64_t t_start_us = 0; + mutable int64_t t_load_us = 0; mutable int64_t t_p_eval_us = 0; mutable int64_t t_eval_us = 0; mutable int64_t t_compute_start_us = 0; - mutable int64_t n_queued_tokens = 0; + mutable int64_t n_queued_tokens = 0; mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls }; // TODO: make implementation details private -struct llama_context_unified : public llama_context { +class llama_context_unified : public llama_context { +public: struct batch_manager; // TODO: tmp until llama-model starts implementing the graph build function diff --git a/src/llama.cpp b/src/llama.cpp index ab6b7f5d3dae4..c568f8d15c63c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -54,11 +54,11 @@ enum llm_norm_type { }; struct llm_build_context { - llama_context & lctx; - const llama_model & model; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; + llama_context & lctx; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; const int64_t n_embd; const int64_t n_layer; @@ -7854,7 +7854,7 @@ struct llama_perf_context_data llama_perf_context(const struct llama_context * c return data; } - data = ctx->get_perf(); + data = ctx->perf_get_data(); return data; } From 5eae8e5183f80a8b669757bde7b26cec05923081 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 13:32:02 +0200 Subject: [PATCH 33/84] context : move build_rope_factors to base class ggml-ci --- src/llama-context.cpp | 172 +++++++++++++++++++++--------------------- src/llama-context.h | 19 +++-- src/llama.cpp | 14 ++-- 3 files changed, 104 insertions(+), 101 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f0d8bdaba073a..b29c98af63add 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -57,6 +57,10 @@ uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } +uint32_t llama_context::n_ctx_per_seq() const { + return cparams.n_ctx / cparams.n_seq_max; +} + uint32_t llama_context::n_batch() const { return cparams.n_batch; } @@ -122,8 +126,8 @@ void llama_context::synchronize() { } void llama_context::attach_threadpool( - ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch) { + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { this->threadpool = threadpool; this->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; } @@ -202,6 +206,86 @@ llama_perf_context_data llama_context::perf_get_data() const { return data; } +ggml_tensor * llama_context::build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il) { + return cvec.apply_to(ctx0, cur, il); +} + +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +ggml_tensor * llama_context::build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur, + ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +ggml_tensor * llama_context::build_rope_factors(int il) { + const auto & hparams = model.hparams; + + // choose long/short freq factors based on the context size + const auto n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + + if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) { + return model.layers[il].rope_long; + } + + return model.layers[il].rope_short; +} + void llama_context::perf_reset() { t_start_us = ggml_time_us(); t_eval_us = n_eval = 0; @@ -217,7 +301,7 @@ llama_context_unified::llama_context_unified( const llama_context_params & params, build_graph_callback && cb_build_graph) : llama_context(model), - cb_build_graph(std::move(cb_build_graph)){ + cb_build_graph(std::move(cb_build_graph)) { const auto & hparams = model.hparams; @@ -1825,69 +1909,6 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) { return n_outputs_max; } -ggml_tensor * llama_context::build_cvec( - ggml_context * ctx0, - ggml_tensor * cur, - int il) { - return cvec.apply_to(ctx0, cur, il); -} - -ggml_tensor * llama_context::build_lora_mm( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - - return res; -} - -ggml_tensor * llama_context::build_lora_mm_id( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur, - ggml_tensor * ids) { - struct ggml_tensor * res = 
ggml_mul_mat_id(ctx0, w, cur, ids); - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } - - const float alpha = lora.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? lora.second * alpha / rank : lora.second; - - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - - return res; -} - void llama_context_unified::kv_self_update() { auto & kv = kv_self; @@ -2189,23 +2210,6 @@ ggml_tensor * llama_context_unified::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -ggml_tensor * llama_context_unified::get_rope_factors(int il) { - const auto & hparams = model.hparams; - - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - - if (model.layers[il].rope_freqs != nullptr) { - return model.layers[il].rope_freqs; - } - - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } - - return model.layers[il].rope_short; -} - ggml_tensor * llama_context_unified::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, @@ -2327,7 +2331,7 @@ void llama_context_unified::build_k_shift( const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - struct ggml_tensor * rope_factors = get_rope_factors(il); + struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], diff --git a/src/llama-context.h b/src/llama-context.h index 8ec7d3e2b1f69..dd1030388e692 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -23,10 +23,11 @@ struct llama_context { const llama_model & get_model() const; const llama_cparams & get_cparams() const; - virtual uint32_t n_ctx() const; - virtual uint32_t n_batch() const; - virtual uint32_t n_ubatch() const; - virtual uint32_t n_seq_max() const = 0; + virtual uint32_t n_ctx() const; + virtual uint32_t n_ctx_per_seq() const; + virtual uint32_t n_batch() const; + virtual uint32_t n_ubatch() const; + virtual uint32_t n_seq_max() const = 0; virtual uint32_t n_threads() const; virtual uint32_t n_threads_batch() const; @@ -126,6 +127,8 @@ struct llama_context { ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * ids); + virtual ggml_tensor * build_rope_factors(int il); + // graph build API (context-specific) virtual ggml_tensor * build_inp_embd( @@ -182,8 +185,6 @@ struct llama_context { ggml_tensor * kq, float kq_scale) = 0; - virtual ggml_tensor * get_rope_factors(int il) = 0; - virtual void build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) = 0; @@ -342,7 +343,7 @@ class llama_context_unified : public llama_context { public: struct batch_manager; - // TODO: tmp until llama-model starts implementing the graph build function + // TODO: tmp until llama_model starts implementing the graph build function typedef std::function build_graph_callback; llama_context_unified( @@ -496,8 +497,6 @@ class llama_context_unified : public llama_context { ggml_tensor * kq, float kq_scale) override; - virtual ggml_tensor * get_rope_factors(int il) override; - virtual void build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) override; @@ -601,7 +600,7 @@ class llama_context_unified : public llama_context { virtual size_t state_get_data( uint8_t 
* dst, size_t size) override; virtual size_t state_set_data(const uint8_t * src, size_t size) override; - virtual size_t state_seq_get_size(llama_seq_id seq_id) override; + virtual size_t state_seq_get_size(llama_seq_id seq_id) override; virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; diff --git a/src/llama.cpp b/src/llama.cpp index c568f8d15c63c..9e37b0cd46dba 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -685,7 +685,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -857,7 +857,7 @@ struct llm_build_context { } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -2999,7 +2999,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, @@ -3706,7 +3706,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -4480,7 +4480,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -5373,7 +5373,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -6572,7 +6572,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); From e633dc171a8ae3d44c647bbd94a1921ed74c181c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 13:48:52 +0200 Subject: [PATCH 34/84] context : introduce llama_graph_i ggml-ci --- src/CMakeLists.txt | 1 + src/llama-context.h | 134 +----------------------------------- src/llama-graph.cpp | 1 + src/llama-graph.h | 164 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 132 deletions(-) create mode 100644 
src/llama-graph.cpp create mode 100644 src/llama-graph.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e1b02e4c08f07..f1f5d41d495a1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,7 @@ add_library(llama llama-chat.cpp llama-context.cpp llama-grammar.cpp + llama-graph.cpp llama-hparams.cpp llama-impl.cpp llama-kv-cache.cpp diff --git a/src/llama-context.h b/src/llama-context.h index dd1030388e692..b446118ff2ffd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -3,6 +3,7 @@ #include "llama.h" #include "llama-batch.h" #include "llama-cparams.h" +#include "llama-graph.h" #include "llama-model.h" #include "llama-kv-cache.h" #include "llama-adapter.h" @@ -16,7 +17,7 @@ using llama_loras = std::unordered_map; -struct llama_context { +struct llama_context : public llama_graph_i { llama_context(const llama_model & model); virtual ~llama_context(); @@ -129,137 +130,6 @@ struct llama_context { virtual ggml_tensor * build_rope_factors(int il); - // graph build API (context-specific) - - virtual ggml_tensor * build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) = 0; - - virtual ggml_tensor * build_inp_pos( - ggml_context * ctx0, - int32_t n_tokens) = 0; - - virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) = 0; - - virtual ggml_tensor * build_inp_mean( - ggml_context * ctx0, - int32_t n_tokens) = 0; - - virtual ggml_tensor * build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) = 0; - - virtual void build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa, - bool worst_case) = 0; - - virtual void build_attn_kv_store( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - int32_t n_tokens, - int64_t il, - bool worst_case) = 0; - - virtual ggml_tensor * build_attn_qkv( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - int32_t n_tokens, - float kq_scale, - int il, - bool worst_case) = 0; - - virtual ggml_tensor * build_soft_max_ext( - ggml_context * ctx0, - ggml_tensor * kq, - float kq_scale) = 0; - - virtual void build_k_shift( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; - - virtual ggml_tensor * build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) = 0; - - virtual ggml_tensor * build_inp_KQ_mask_cross( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) = 0; - - virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0, - bool worst_case) = 0; - - virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0, - bool worst_case) = 0; - - virtual ggml_tensor * build_copy_mask_state( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * s, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - int32_t n_tokens, - int32_t n_state, - int32_t n_seqs, - bool worst_case) = 0; - - virtual ggml_tensor * build_mamba_layer( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * cur, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il, - bool worst_case) = 0; - - virtual ggml_tensor * build_rwkv_token_shift_load( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int 
il, - bool worst_case) = 0; - - virtual ggml_tensor * build_rwkv_token_shift_store( - ggml_context * ctx0, - ggml_tensor * token_shift, - const llama_ubatch & ubatch, - int il, - bool worst_case) = 0; - - virtual ggml_tensor * build_rwkv6_time_mix( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * cur, - ggml_tensor * x_prev, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il, - bool worst_case) = 0; - // state save/load virtual size_t state_get_size() = 0; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp new file mode 100644 index 0000000000000..20f2ee0bd56aa --- /dev/null +++ b/src/llama-graph.cpp @@ -0,0 +1 @@ +#include "llama-graph.h" diff --git a/src/llama-graph.h b/src/llama-graph.h new file mode 100644 index 0000000000000..37dff8db40541 --- /dev/null +++ b/src/llama-graph.h @@ -0,0 +1,164 @@ +#pragma once + +#include + +struct ggml_cgraph; +struct ggml_context; +struct ggml_tensor; +struct llama_ubatch; + +// TODO: pass to llama_model graph build +class llama_graph_i { +public: + // apply control vector for layer il + virtual ggml_tensor * build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il) = 0; + + // do mat_mul, while optionally apply lora + virtual ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) = 0; + + // do mat_mul_id, while optionally apply lora + virtual ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids) = 0; + + virtual ggml_tensor * build_rope_factors(int il) = 0; + + // graph build API (context-specific) + + virtual ggml_tensor * build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) = 0; + + virtual ggml_tensor * build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual ggml_tensor * build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual ggml_tensor * build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) = 0; + + virtual void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) = 0; + + virtual ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) = 0; + + virtual void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) = 0; + + virtual ggml_tensor * build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_KQ_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) = 0; + + virtual ggml_tensor * 
build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) = 0; + + virtual ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; +}; From 0ab50f1bbb4770ac7575f261fa53df6ae0d68767 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 13:59:43 +0200 Subject: [PATCH 35/84] context : prepare llama_model graph build ggml-ci --- src/llama.cpp | 269 +++++++++++++++++++++++++------------------------- 1 file changed, 136 insertions(+), 133 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 9e37b0cd46dba..e71a87ee9fcdf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -54,7 +54,7 @@ enum llm_norm_type { }; struct llm_build_context { - llama_context & lctx; + llama_graph_i & lgf; const llama_model & model; const llama_hparams & hparams; const llama_cparams & cparams; @@ -98,14 +98,17 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, + llama_graph_i & lgf, + const llama_model & model, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + llm_build_cb && cb, + ggml_context_ptr && ctx, bool worst_case) : - lctx (lctx), - model (lctx.get_model()), + lgf (lgf), + model (model), hparams (model.hparams), - cparams (lctx.get_cparams()), + cparams (cparams), ubatch (ubatch), n_embd (hparams.n_embd), n_layer (hparams.n_layer), @@ -133,14 +136,14 @@ struct llm_build_context { flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - cb (cb), - ctx (lctx.init()), - ctx0 (ctx.get()) { + cb (std::move(cb)), + ctx (std::move(ctx)), + ctx0 (this->ctx.get()) { } // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lctx.build_inp_embd(ctx0, tok_embd, ubatch); + struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch); cb(inpL, "inp_embd", -1); return inpL; @@ -150,7 +153,7 @@ struct llm_build_context { struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, struct ggml_tensor * cur) { - return lctx.build_lora_mm(ctx0, w, cur); + return lgf.build_lora_mm(ctx0, w, cur); } // TODO: tmp @@ -158,7 +161,7 @@ struct llm_build_context { struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b struct ggml_tensor * ids) { - return lctx.build_lora_mm_id(ctx0, w, cur, ids); + return lgf.build_lora_mm_id(ctx0, w, cur, ids); } struct ggml_tensor * build_norm( @@ -460,12 +463,12 @@ struct llm_build_context { 
ggml_build_forward_expand(graph, v_cur); //build_kv_store(graph, k_cur, v_cur, il); - lctx.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); struct ggml_tensor * cur; //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lctx.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -503,7 +506,7 @@ struct llm_build_context { struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - lctx.build_k_shift(ctx0, gf); + lgf.build_k_shift(ctx0, gf); return gf; } @@ -511,34 +514,34 @@ struct llm_build_context { struct ggml_cgraph * build_defrag() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - lctx.build_defrag(ctx0, gf); + lgf.build_defrag(ctx0, gf); return gf; } struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lctx.build_inp_pos(ctx0, n_tokens); + ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); cb(cur, "inp_pos", -1); return cur; } struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lctx.build_inp_out_ids(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case); cb(cur, "inp_out_ids", -1); return cur; } struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lctx.build_inp_mean(ctx0, n_tokens); + ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens); cb(cur, "inp_mean", -1); return cur; } struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lctx.build_inp_cls(ctx0, n_tokens); + ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens); cb(cur, "inp_cls", -1); return cur; @@ -642,14 +645,14 @@ struct llm_build_context { //} struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lctx.build_inp_embd_enc(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case); cb(cur, "embd_enc", -1); return cur; } struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lctx.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); cb(cur, "KQ_mask_cross", -1); return cur; @@ -670,7 +673,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -685,7 +688,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -787,7 +790,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -831,7 +834,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -857,7 +860,7 @@ struct llm_build_context { } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -943,7 +946,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -987,7 +990,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1064,7 +1067,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1102,7 +1105,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1169,7 +1172,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1206,7 +1209,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -1286,7 +1289,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1328,7 +1331,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1435,7 +1438,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1480,7 +1483,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1563,7 +1566,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1602,7 +1605,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -1670,7 +1673,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1702,7 +1705,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ 
-1762,7 +1765,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1816,7 +1819,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - lctx.build_attn_inp(ctx0, n_tokens, false, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -1887,7 +1890,7 @@ struct llm_build_context { cb(kq, "kq", il); //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lctx.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); + kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -1991,7 +1994,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); inpL = build_norm(inpL, model.tok_norm, @@ -2059,7 +2062,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2093,7 +2096,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -2197,7 +2200,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2234,7 +2237,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -2346,7 +2349,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2384,7 +2387,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -2459,7 +2462,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2497,7 +2500,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -2571,7 +2574,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ 
-2608,7 +2611,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -2687,7 +2690,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2725,7 +2728,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -2831,7 +2834,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2871,7 +2874,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -2953,7 +2956,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2991,7 +2994,7 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -2999,7 +3002,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, @@ -3093,7 +3096,7 @@ struct llm_build_context { cur = ggml_add(ctx0, residual, cur); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3135,7 +3138,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -3201,7 +3204,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3240,7 +3243,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -3308,7 +3311,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - 
cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3345,7 +3348,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -3419,7 +3422,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3455,7 +3458,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -3535,7 +3538,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3573,7 +3576,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -3653,7 +3656,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3701,12 +3704,12 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -3858,7 +3861,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3902,7 +3905,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -3971,7 +3974,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4010,7 +4013,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -4095,7 +4098,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4140,7 +4143,7 @@ struct 
llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4221,7 +4224,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4253,8 +4256,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -4264,7 +4267,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); - cur = lctx.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -4276,7 +4279,7 @@ struct llm_build_context { // residual cur = ggml_add(ctx0, cur, inpL); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4314,7 +4317,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -4418,7 +4421,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4462,7 +4465,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -4480,7 +4483,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -4549,7 +4552,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4596,7 +4599,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4677,7 +4680,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = 
lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4715,7 +4718,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4796,7 +4799,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4838,7 +4841,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4922,7 +4925,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4958,7 +4961,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -5048,7 +5051,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5085,7 +5088,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -5162,7 +5165,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5191,7 +5194,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5228,7 +5231,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5319,7 +5322,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5357,7 +5360,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -5373,7 +5376,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -5473,7 +5476,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5521,7 +5524,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5701,7 +5704,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5738,7 +5741,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6215,7 +6218,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -6309,7 +6312,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6438,7 +6441,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6520,7 +6523,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6558,7 +6561,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6572,7 +6575,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -6642,7 +6645,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = 
lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6676,8 +6679,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -6686,7 +6689,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; - struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( ctx0, gf, state_copy, state_mask, ubatch, il, worst_case ); @@ -6703,7 +6706,7 @@ struct llm_build_context { 1 ); - cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -6726,13 +6729,13 @@ struct llm_build_context { ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 ); - ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { cur = ggml_scale(ctx0, cur, 0.5F); } - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6766,8 +6769,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -6778,7 +6781,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; - struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( ctx0, gf, state_copy, state_mask, ubatch, il, worst_case ); @@ -6792,10 +6795,10 @@ struct llm_build_context { 1 ); - cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); - ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); 
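build_rwkv_token_shift_load/store and build_rwkv6_time_mix above move the per-layer recurrent state through the graph interface; the underlying idea is RWKV's token shift, where each layer blends the current token's activations with the previous token's. A minimal per-channel sketch of the channel-mix formula that appears later in this patch (sx = x_prev - x, the xk/xr lerps, a sigmoid receptance gate and a squared-ReLU key), with scalar weights standing in for the real projection matrices:

    #include <cmath>
    #include <cstdio>

    static float sigmoidf(float x) { return 1.0f / (1.0f + std::exp(-x)); }
    static float reluf(float x)    { return x > 0.0f ? x : 0.0f; }

    // one channel of RWKV channel-mix; w_r/w_k/w_v stand in for the
    // receptance/key/value projections done with build_lora_mm in the real graph
    static float rwkv_channel_mix(float x, float x_prev, float mu_k, float mu_r,
                                  float w_r, float w_k, float w_v) {
        const float sx = x_prev - x;        // token shift: difference to the previous token
        const float xk = x + sx * mu_k;     // lerp towards the previous token for the key path
        const float xr = x + sx * mu_r;     // ... and for the receptance path

        const float r = sigmoidf(w_r * xr); // receptance gate
        const float k = reluf(w_k * xk);    // squared-ReLU key
        return r * (w_v * (k * k));
    }

    int main() {
        const float y = rwkv_channel_mix(0.5f, 1.0f, 0.3f, 0.7f, 1.0f, 2.0f, 0.5f);
        std::printf("%f\n", y);
    }
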
cb(ffn_inp, "ffn_inp", il); @@ -6816,7 +6819,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6860,7 +6863,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6976,7 +6979,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7206,7 +7209,7 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_cgraph * result = NULL; - struct llm_build_context llm(lctx, ubatch, cb, worst_case); + struct llm_build_context llm(lctx, lctx.get_model(), lctx.get_cparams(), ubatch, std::move(cb), lctx.init(), worst_case); switch (model.arch) { case LLM_ARCH_LLAMA: From f63aeecce681afacd5acfab8401fb298c16e31de Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 15:08:40 +0200 Subject: [PATCH 36/84] llama : models now build their graphs using llama_graph_i ggml-ci --- src/llama-context.cpp | 59 +- src/llama-context.h | 26 +- src/llama-graph.h | 8 +- src/llama-model.cpp | 7374 ++++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 13 + src/llama.cpp | 7418 +---------------------------------------- 6 files changed, 7457 insertions(+), 7441 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b29c98af63add..74d6a67bbe9d2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -193,6 +193,47 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } +void llama_context::build_cb( + ggml_tensor * cur, + const char * name, + int il) { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } + + if (!cparams.offload_kqv) { + if (strcmp(name, "kqv_merged_cont") == 0) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends + // FIXME: fix in ggml_backend_sched + const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; + // TODO: during #11213, the requirement for ubatch.n_tokens < 32 was removed to simplify + // not sure if this is still needed, but it can be brought back if needed + //if (ubatch.n_tokens < 32 || full_offload) { + if (full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + const auto & dev_layer = model.dev_layer(il); + for (auto & backend : backends) { + if (ggml_backend_get_device(backend.get()) == dev_layer) { + if (ggml_backend_supports_op(backend.get(), cur)) { + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get()); + } + } + } + } + } +} + +ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) { + return model.build_graph(*this, cparams, ubatch, init(), worst_case); +} + llama_perf_context_data llama_context::perf_get_data() const { llama_perf_context_data data = {}; @@ -298,11 +339,7 @@ void llama_context::perf_reset() { 
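The hunks above give llama_context a concrete build_cb() and a build_graph() that simply forwards to the model; the next hunk then removes the build_graph_callback that used to be injected into llama_context_unified. A compact sketch of that shape of refactor (a std::function member replaced by a virtual that delegates to the model), using stand-in types rather than the real ggml/llama ones:

    #include <cstdio>
    #include <string>

    // stand-ins for ggml_cgraph / llama_ubatch; the real types live in ggml and llama.cpp
    struct graph  { std::string desc; };
    struct ubatch { int n_tokens; };

    // before: the context stored a std::function and the caller had to inject it;
    // after (this patch): graph construction is a virtual on the context itself,
    // which forwards to the model - sketch of the "after" shape:
    struct context_base {
        virtual ~context_base() = default;
        virtual graph build_graph(const ubatch & ub, bool worst_case) = 0;
    };

    struct model_stub {
        graph build_graph(const ubatch & ub, bool worst_case) const {
            return { "graph for " + std::to_string(ub.n_tokens) +
                     (worst_case ? " tokens (worst case)" : " tokens") };
        }
    };

    struct context_unified : context_base {
        explicit context_unified(const model_stub & m) : model(m) {}

        graph build_graph(const ubatch & ub, bool worst_case) override {
            return model.build_graph(ub, worst_case); // no callback indirection any more
        }

        const model_stub & model;
    };

    int main() {
        model_stub m;
        context_unified ctx(m);
        std::printf("%s\n", ctx.build_graph({512}, true).desc.c_str());
    }
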
llama_context_unified::llama_context_unified( const llama_model & model, - const llama_context_params & params, - build_graph_callback && cb_build_graph) : - llama_context(model), - cb_build_graph(std::move(cb_build_graph)) { - + const llama_context_params & params) : llama_context(model) { const auto & hparams = model.hparams; cparams.n_seq_max = std::max(1u, params.n_seq_max); @@ -555,7 +592,7 @@ llama_context_unified::llama_context_unified( llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = this->cb_build_graph(*this, ubatch_pp, true); + ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); // reserve pp graph first so that buffers are only allocated once ggml_backend_sched_reserve(sched.get(), gf_pp); @@ -564,13 +601,13 @@ llama_context_unified::llama_context_unified( // reserve with tg graph to get the number of splits and nodes llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = this->cb_build_graph(*this, ubatch_tg, true); + ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); ggml_backend_sched_reserve(sched.get(), gf_tg); int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = this->cb_build_graph(*this, ubatch_pp, true); + gf_pp = build_graph(ubatch_pp, true); if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); @@ -893,7 +930,7 @@ struct llama_context_unified::batch_manager { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = lctx.cb_build_graph(lctx, ubatch, true); + ggml_cgraph * gf = lctx.build_graph(ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(lctx.sched.get()); @@ -1004,7 +1041,7 @@ int llama_context_unified::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + ggml_cgraph * gf = build_graph(ubatch, false); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -1227,7 +1264,7 @@ int llama_context_unified::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + ggml_cgraph * gf = build_graph(ubatch, false); ggml_backend_sched_alloc_graph(sched.get(), gf); diff --git a/src/llama-context.h b/src/llama-context.h index b446118ff2ffd..8d7a6ad58dec4 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -82,6 +82,14 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); + virtual void build_cb( + ggml_tensor * cur, + const char * name, + int il); + + 
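The virtual build_cb() declared here is the per-tensor callback whose definition appears in the llama-context.cpp hunk above: it names each node ("%s-%d" for layer-local tensors, the bare name otherwise) and may also override backend placement for offloading. A tiny sketch of just the naming half, with a stand-in node type instead of ggml_tensor:

    #include <cstdio>
    #include <string>

    // stand-in node; the real callback receives a ggml_tensor and also decides backend placement
    struct node { std::string name; };

    // mirror of the naming logic in build_cb(): layer-local tensors get a "-<il>" suffix,
    // graph-global tensors (il < 0) keep the bare name
    static void name_cb(node & cur, const char * name, int il) {
        if (il >= 0) {
            char buf[64];
            std::snprintf(buf, sizeof(buf), "%s-%d", name, il);
            cur.name = buf;
        } else {
            cur.name = name;
        }
    }

    int main() {
        node a, b;
        name_cb(a, "ffn_out", 12);     // -> "ffn_out-12"
        name_cb(b, "result_norm", -1); // -> "result_norm"
        std::printf("%s %s\n", a.name.c_str(), b.name.c_str());
    }
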
// TODO: add encode/decode graphs + virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case); + // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -171,11 +179,6 @@ struct llama_context : public llama_graph_i { // members - // TODO: temporary public until llama_context implements the graph build function - std::vector backends; - ggml_backend_t backend_cpu = nullptr; - ggml_backend_sched_ptr sched; - protected: const llama_model & model; @@ -189,8 +192,13 @@ struct llama_context : public llama_graph_i { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + ggml_backend_t backend_cpu = nullptr; + std::vector backends; + std::vector> set_n_threads_fns; + ggml_backend_sched_ptr sched; + // memory buffers used to evaluate the model std::vector buf_compute_meta; @@ -213,13 +221,9 @@ class llama_context_unified : public llama_context { public: struct batch_manager; - // TODO: tmp until llama_model starts implementing the graph build function - typedef std::function build_graph_callback; - llama_context_unified( const llama_model & model, - const llama_context_params & params, - build_graph_callback && cb_build_graph); + const llama_context_params & params); virtual ~llama_context_unified(); @@ -244,8 +248,6 @@ class llama_context_unified : public llama_context { llama_sbatch sbatch; - build_graph_callback cb_build_graph; - // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; diff --git a/src/llama-graph.h b/src/llama-graph.h index 37dff8db40541..0084d99ccade6 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -7,9 +7,15 @@ struct ggml_context; struct ggml_tensor; struct llama_ubatch; -// TODO: pass to llama_model graph build +// TODO: can become more granular in the future class llama_graph_i { public: + // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
+ virtual void build_cb( + ggml_tensor * cur, + const char * name, + int il) = 0; + // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0f4b62c434d4b..bded48be6c25b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2,12 +2,17 @@ #include "llama-impl.h" #include "llama-mmap.h" +#include "llama-graph.h" +#include "llama-batch.h" +#include "llama-cparams.h" #include "llama-model-loader.h" #include "ggml-cpp.h" #include #include +#include +#include #include #include #include @@ -3774,6 +3779,7375 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const { return it->second; } +// +// llm_build +// + +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, + LLM_FFN_SWIGLU, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, + LLM_NORM_GROUP, +}; + +struct llm_build_context { + llama_graph_i & lgf; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_rot; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head_k; + const int64_t n_embd_k_gqa; + const int64_t n_embd_head_v; + const int64_t n_embd_v_gqa; + const int64_t n_expert; + const int64_t n_expert_used; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_ctx_orig; + + const bool worst_case; + const bool flash_attn; + + const enum llama_pooling_type pooling_type; + const enum llama_rope_type rope_type; + + const ggml_context_ptr ctx = nullptr; + ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_graph_i & lgf, + const llama_model & model, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + ggml_context_ptr && ctx, + bool worst_case) : + lgf (lgf), + model (model), + hparams (model.hparams), + cparams (cparams), + ubatch (ubatch), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_rot (hparams.n_rot), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head()), + n_head_kv (hparams.n_head_kv()), + n_embd_head_k (hparams.n_embd_head_k), + n_embd_k_gqa (hparams.n_embd_k_gqa()), + n_embd_head_v (hparams.n_embd_head_v), + n_embd_v_gqa (hparams.n_embd_v_gqa()), + n_expert (hparams.n_expert), + n_expert_used (hparams.n_expert_used), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (ubatch.n_tokens), + n_ctx_orig (cparams.n_ctx_orig_yarn), + worst_case (worst_case), + flash_attn (cparams.flash_attn), + pooling_type (cparams.pooling_type), + rope_type (hparams.rope_type), + ctx (std::move(ctx)), + ctx0 (this->ctx.get()) { + } + + // TODO: tmp + void cb(struct ggml_tensor * cur, const char * name, int il) { + lgf.build_cb(cur, name, il); + } + + // TODO: tmp + struct ggml_tensor * 
build_inp_embd(struct ggml_tensor * tok_embd) { + struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch); + cb(inpL, "inp_embd", -1); + + return inpL; + } + + // TODO: tmp + struct ggml_tensor * build_lora_mm( + struct ggml_tensor * w, + struct ggml_tensor * cur) { + return lgf.build_lora_mm(ctx0, w, cur); + } + + // TODO: tmp + struct ggml_tensor * build_lora_mm_id( + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + return lgf.build_lora_mm_id(ctx0, w, cur, ids); + } + + struct ggml_tensor * build_norm( + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM_GROUP: + { + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); + cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); + } break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; + } + + struct ggml_tensor * build_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * up_s, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * gate_s, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * down_s, + struct ggml_tensor * act_scales, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + int il) { + struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (up_s) { + tmp = ggml_mul(ctx0, tmp, up_s); + cb(tmp, "ffn_up_s", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = build_lora_mm(gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = build_lora_mm(gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + + if (gate_s) { + cur = ggml_mul(ctx0, cur, gate_s); + cb(cur, "ffn_gate_s", il); + } + + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx0, cur, act_scales); + cb(cur, "ffn_act", il); + } + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx0, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + case LLM_FFN_SWIGLU: + { + // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + int64_t split_point = cur->ne[0] / 2; + struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + x0 = ggml_silu(ctx0, x0); + cb(cur, "ffn_silu", il); + + cur = ggml_mul(ctx0, x0, x1); + cb(cur, "ffn_mul", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + if (down) { + cur = build_lora_mm(down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + if (down_s) { + cur = ggml_mul(ctx0, cur, down_s); + cb(cur, "ffn_down_s", il); + } + + return cur; + } + + struct ggml_tensor * build_moe_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * gate_inp, + struct ggml_tensor * up_exps, + struct ggml_tensor * gate_exps, + struct ggml_tensor * down_exps, + struct ggml_tensor * exp_probs_b, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, + llama_expert_gating_func_type gating_op, + int il) { + int64_t n_embd = cur->ne[0]; + int64_t n_tokens = cur->ne[1]; + + ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = nullptr; + switch (gating_op) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: + { + probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] + } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: + { + probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] + } break; + default: + GGML_ABORT("fatal error"); + } + cb(probs, "ffn_moe_probs", il); + + // add experts selection bias - introduced in DeepSeek V3 + // leave probs unbiased as it's later used to get expert weights + ggml_tensor * selection_probs = probs; + if (exp_probs_b != nullptr) { + selection_probs = ggml_add(ctx0, probs, exp_probs_b); + cb(selection_probs, "ffn_moe_probs_biased", il); + } + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + cb(selected_experts, "ffn_moe_topk", il); + + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights", il); + + if (norm_w) { + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_norm", il); + + weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); + } + if (scale_w) { + weights = ggml_scale(ctx0, weights, w_scale); + cb(weights, "ffn_moe_weights_scaled", il); + } + + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(gate, "ffn_moe_gate", il); + + switch (type_op) { + case LLM_FFN_SILU: + { + gate = ggml_silu(ctx0, gate); + 
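build_moe_ffn above implements the usual mixture-of-experts routing: softmax the router logits, pick the top n_expert_used experts, optionally renormalize their weights, run each selected expert and sum the weighted results. A stand-alone scalar sketch of that routing (run_expert is a toy stand-in for the per-expert gated FFN done with build_lora_mm_id in the real graph):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // toy expert: the real path runs a gated FFN per selected expert
    static float run_expert(int expert, float x) { return x * float(expert + 1); }

    // route one "token" (a single scalar here) through the top-k experts and mix the results,
    // following the same steps as build_moe_ffn: softmax -> top-k -> (optional) renormalize -> weighted sum
    static float moe_forward(float x, const std::vector<float> & logits, int n_used, bool norm_w) {
        const int n_expert = (int) logits.size();

        // softmax over the router logits
        std::vector<float> probs(n_expert);
        const float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
        for (auto & p : probs) { p /= sum; }

        // top-k expert selection
        std::vector<int> idx(n_expert);
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // optionally renormalize the selected weights so they sum to 1
        float wsum = 0.0f;
        for (int i = 0; i < n_used; ++i) { wsum += probs[idx[i]]; }

        float out = 0.0f;
        for (int i = 0; i < n_used; ++i) {
            const float w = norm_w ? probs[idx[i]] / wsum : probs[idx[i]];
            out += w * run_expert(idx[i], x);
        }
        return out;
    }

    int main() {
        std::printf("%f\n", moe_forward(1.0f, {0.1f, 2.0f, 0.3f, 1.5f}, /*n_used=*/2, /*norm_w=*/true));
    }
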
cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx0, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ABORT("fatal error"); + } + + ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] + cb(par, "ffn_moe_gate_par", il); + + ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + cb(experts, "ffn_moe_down", il); + + experts = ggml_mul(ctx0, experts, weights); + + // aggregate experts + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + } + } + + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx0, moe_out); + } + + return moe_out; + } + + struct ggml_tensor * build_attn( + struct ggml_cgraph * graph, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(graph, q_cur); + ggml_build_forward_expand(graph, k_cur); + ggml_build_forward_expand(graph, v_cur); + + //build_kv_store(graph, k_cur, v_cur, il); + lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + + struct ggml_tensor * cur; + + //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cb(cur, "kqv_out", il); + + return cur; + } + + struct ggml_tensor * build_rwkv_channel_mix( + const struct llama_layer * layer, + struct ggml_tensor * cur, + struct ggml_tensor * x_prev, + const llm_arch arch) { + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + switch (arch) { + case LLM_ARCH_RWKV6: + { + struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + struct ggml_tensor * k = ggml_sqr( + ctx0, + ggml_relu( + ctx0, + build_lora_mm(layer->channel_mix_key, xk) + ) + ); + cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + } break; + default: + GGML_ABORT("fatal error"); + } + + return cur; + } + + struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + lgf.build_k_shift(ctx0, gf); + + return gf; + } + + struct ggml_cgraph * build_defrag() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + lgf.build_defrag(ctx0, gf); + + return gf; + } + + struct ggml_tensor * build_inp_pos() { + ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); + cb(cur, "inp_pos", -1); + + return cur; + } + + struct ggml_tensor * build_inp_out_ids() { + ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case); + cb(cur, "inp_out_ids", -1); + + return cur; + } + + struct ggml_tensor * build_inp_mean() { + ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens); + cb(cur, "inp_mean", -1); + + return cur; + } + + struct ggml_tensor * 
build_inp_cls() { + ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens); + cb(cur, "inp_cls", -1); + + return cur; + } + + struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { + // find result_norm tensor for input + struct ggml_tensor * inp = nullptr; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + inp = ggml_graph_node(gf, i); + if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { + break; + } + + inp = nullptr; + } + GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); + + struct ggml_tensor * cur; + + switch (pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + cur = inp; + } break; + case LLAMA_POOLING_TYPE_MEAN: + { + struct ggml_tensor * inp_mean = build_inp_mean(); + cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean); + } break; + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + struct ggml_tensor * inp_cls = build_inp_cls(); + cur = ggml_get_rows(ctx0, inp, inp_cls); + } break; + case LLAMA_POOLING_TYPE_RANK: + { + struct ggml_tensor * inp_cls = build_inp_cls(); + inp = ggml_get_rows(ctx0, inp, inp_cls); + + // classification head + // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 + GGML_ASSERT(model.cls != nullptr); + GGML_ASSERT(model.cls_b != nullptr); + + cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b); + cur = ggml_tanh(ctx0, cur); + + // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 + if (model.cls_out) { + GGML_ASSERT(model.cls_out_b != nullptr); + + cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b); + } + } break; + default: + { + GGML_ABORT("unknown pooling type"); + } + } + + cb(cur, "result_embd_pooled", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + //struct ggml_tensor * build_pos_bucket(bool causal) { + // if (causal) { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + // } else { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + // } + + // ggml_set_input(lctx.inp_pos_bucket); + // cb(lctx.inp_pos_bucket, "pos_bucket", -1); + + // return lctx.inp_pos_bucket; + //} + + //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); + // cb(pos_bucket_1d, "pos_bucket_1d", -1); + + // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + // cb(pos_bias, "pos_bias", -1); + + // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); + // cb(pos_bias, "pos_bias", -1); + + // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + // cb(pos_bias, "pos_bias", -1); + + // pos_bias = ggml_cont(ctx0, pos_bias); + // cb(pos_bias, "pos_bias", -1); + + // return pos_bias; + //} + + struct ggml_tensor * build_inp_embd_enc() { + ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case); + cb(cur, "embd_enc", -1); + + return cur; + } + + struct 
ggml_tensor * build_inp_KQ_mask_cross() { + ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + cb(cur, "KQ_mask_cross", -1); + + return cur; + } + + struct ggml_cgraph * build_llama() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = 
build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_deci() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head = hparams.n_head(il); + + if (n_head == 0) { + // attention-free layer of Llama-3_1-Nemotron-51B + cur = inpL; + } else { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } + + if (n_head > 0 && n_head_kv == 0) { + // "linear attention" of Llama-3_1-Nemotron-51B + cur = build_lora_mm(model.layers[il].wo, cur); + cb(cur, "wo", il); + } else if (n_head > 0) { + // self-attention + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + 
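The Qcur/Kcur tensors here are rotated with ggml_rope_ext before attention. Stripped of YaRN scaling and per-dimension frequency factors, rotary position embedding just rotates pairs of dimensions by angles that grow with the token position and shrink with the dimension index; a basic sketch (rope_basic is illustrative, not the ggml kernel):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // rotate consecutive (even, odd) pairs of a head vector by position-dependent angles:
    // theta for pair i is pos * freq_base^(-i/n_rot); the real ggml_rope_ext additionally
    // supports YaRN (ext_factor, attn_factor, beta_fast/slow) and rope_factors per dimension
    static void rope_basic(std::vector<float> & x, int pos, int n_rot, float freq_base) {
        for (int i = 0; i + 1 < n_rot; i += 2) {
            const float theta = pos * std::pow(freq_base, -float(i) / float(n_rot));
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            const float x0 = x[i];
            const float x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }

    int main() {
        std::vector<float> q = {1.0f, 0.0f, 1.0f, 0.0f};
        rope_basic(q, /*pos=*/3, /*n_rot=*/4, /*freq_base=*/10000.0f);
        std::printf("%f %f %f %f\n", q[0], q[1], q[2], q[3]);
    }
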
cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + // modified to support attention-free layer of Llama-3_1-Nemotron-51B + struct ggml_tensor * ffn_inp = cur; + if (n_head > 0) { + ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + } + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_baichuan() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + switch (model.type) { + case LLM_TYPE_7B: + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + break; + case LLM_TYPE_13B: + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); + break; + default: + GGML_ABORT("fatal error"); + } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_xverse() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct 
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_falcon() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = build_norm(inpL, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, il); + cb(cur, "attn_norm_2", il); + } else { + cur = attn_norm; + } + + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 
n_tokens); + + // using mode = 2 for neox mode + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = cur; + + // feed forward + { + cur = build_ffn(attn_norm, // !! use the attn norm, not the result + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = ggml_add(ctx0, cur, inpL); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_grok() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // multiply by embedding_multiplier_scale of 78.38367176906169 + inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, 
+ ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Grok + // if attn_out_norm is present then apply it before adding the input + if (model.layers[il].attn_out_norm) { + cur = build_norm(cur, + model.layers[il].attn_out_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_out_norm", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_GELU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + // Grok + // if layer_out_norm is present then apply it before adding the input + // Idea: maybe ffn_out_norm is a better name + if (model.layers[il].layer_out_norm) { + cur = build_norm(cur, + model.layers[il].layer_out_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "layer_out_norm", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + // Grok + // multiply logits by output_multiplier_scale of 0.5773502691896257 + + cur = ggml_scale(ctx0, cur, 0.5773502691896257f); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_dbrx() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + 
n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, il); + cb(cur, "attn_out_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_starcoder() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, 
n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_refact() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", 
-1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_bert() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + struct ggml_tensor * inp_pos = nullptr; + + if (model.arch != LLM_ARCH_JINA_BERT_V2) { + inp_pos = build_inp_pos(); + } + + // construct input embeddings (token, type, position) + inpL = build_inp_embd(model.tok_embd); + + // token types are hardcoded to zero ("Sentence A") + struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); + inpL = ggml_add(ctx0, inpL, type_row0); + if (model.arch == LLM_ARCH_BERT) { + inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + } + cb(inpL, "inp_embd", -1); + + // embed layer norm + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + cb(inpL, "inp_norm", -1); + + lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case); + + // iterate layers + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur = inpL; + + struct ggml_tensor * Qcur; + struct ggml_tensor * Kcur; + struct ggml_tensor * Vcur; + + // self-attention + if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); + cb(Qcur, "Qcur", il); + + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, il); + } + + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); + cb(Kcur, "Kcur", il); + + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, il); + } + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + } else { + // compute Q and K and RoPE them + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); + + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 
1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); + cb(kq, "kq_soft_max_ext", il); + + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); + + ggml_build_forward_expand(gf, cur); + + cur = build_lora_mm(model.layers[il].wo, cur); + if (model.layers[il].bo) { + cb(cur, "kqv_wo", il); + } + + if (model.layers[il].bo) { + cur = ggml_add(ctx0, cur, model.layers[il].bo); + } + cb(cur, "kqv_out", il); + + if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // re-add the layer input + cur = ggml_add(ctx0, cur, inpL); + + // attention layer norm + cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); + + if (model.layers[il].attn_norm_2 != nullptr) { + cur = ggml_add(ctx0, cur, inpL); // re-add the layer input + cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); + } + + struct ggml_tensor * ffn_inp = cur; + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (model.arch == LLM_ARCH_BERT) { + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, il); + } else { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + } + cb(cur, "ffn_out", il); + + // attentions bypass the intermediate layer + cur = ggml_add(ctx0, cur, ffn_inp); + + // output layer norm + cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cb(cur, "result_embd", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_bloom() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + inpL = build_norm(inpL, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, -1); + cb(inpL, "inp_norm", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // 
self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_mpt() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + if (model.pos_embd) { + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { + cur = attn_norm; + + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + if (model.layers[il].bqkv){ + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + + if (hparams.f_clamp_kqv > 0.0f) { + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, 
n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // Q/K Layernorm + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, il); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } else { + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // feed forward + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + model.layers[il].ffn_act, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_stablelm() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + struct ggml_tensor * inpSA = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if 
(model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + } + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM, il); + cb(Kcur, "Kcur", il); + } + + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + if (model.layers[il].ffn_norm) { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + } else { + // parallel residual + cur = inpSA; + } + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_qwen() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, 
ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // using mode = 2 for neox mode + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward forward + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_qwen2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, 
Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_qwen2vl() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_multi( + ctx0, + ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_multi( + ctx0, + ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // 
skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_qwen2moe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + 
model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); + cb(cur_gate_inp, "ffn_shexp_gate_inp", il); + + // sigmoid + ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); + cb(cur_gate, "ffn_shexp_gate", il); + + ggml_tensor * cur_ffn = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur_ffn, "ffn_shexp", il); + + ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate); + cb(ffn_shexp_out, "ffn_shexp_out", il); + + moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out); + cb(moe_out, "ffn_out", il); + + cur = moe_out; + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_phi2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * attn_norm_output; + struct ggml_tensor * ffn_output; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + attn_norm_output = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(attn_norm_output, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv) { + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + } else { + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, 
rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + // with phi2, we scale the Q to avoid precision issues + // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 + Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids); + } + + // FF + { + ffn_output = build_ffn(attn_norm_output, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(ffn_output, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_output); + cur = ggml_add(ctx0, cur, inpL); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output_no_bias", -1); + + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); + return gf; + } + + struct ggml_cgraph * build_phi3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + + for (int il = 0; il < n_layer; ++il) { + auto residual = inpL; + + // self-attention + { + // rope freq factors for 128k context + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + struct ggml_tensor* attn_norm_output = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM_RMS, il); + cb(attn_norm_output, "attn_norm", il); + + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv) { + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); + } else { + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, 
build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor* inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, residual); + residual = cur; + + cur = build_norm(cur, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + } + + cur = ggml_add(ctx0, residual, cur); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + + if (model.output_b != nullptr) { + cb(cur, "result_output_no_bias", -1); + cur = ggml_add(ctx0, cur, model.output_b); + } + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + + struct ggml_cgraph * build_plamo() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + struct ggml_tensor * attention_norm = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct 
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, + n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, + n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + struct ggml_tensor * sa_out = cur; + + cur = attention_norm; + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // feed-forward network + { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, sa_out); + cur = ggml_add(ctx0, cur, inpL); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_gpt2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing 
output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_codeshell() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(tmpq, "tmpq", il); + cb(tmpk, "tmpk", il); + cb(Vcur, "Vcur", il); + + struct ggml_tensor * Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = 
build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_orion() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + // if (model.layers[il].bq) { + // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + // cb(Qcur, "Qcur", il); + // } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + // if (model.layers[il].bk) { + // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + // cb(Kcur, "Kcur", il); + // } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + // if (model.layers[il].bv) { + // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + // cb(Vcur, "Vcur", il); + // } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + 
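+ // residual connection: add the FFN output back to ffn_inp (attention output + layer input)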
+ cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_internlm2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, 
cur); + + return gf; + } + + struct ggml_cgraph * build_minicpm3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // scale the input embeddings + inpL = ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + struct ggml_tensor * q = NULL; + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = build_norm(q, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); + + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); + + // split into {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); + + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + // TODO: the CUDA backend used to not support non-cont. 
(RMS) norm, investigate removing ggml_cont + kv_compressed = ggml_cont(ctx0, kv_compressed); + kv_compressed = build_norm(kv_compressed, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, il); + cb(kv_compressed, "kv_compressed", il); + + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // shared RoPE key + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(k_pe, "k_pe", il); + + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // scale_res - scale the hidden states for residual connection + const float scale_res = scale_depth/sqrtf(float(n_layer)); + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", il); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + // scale the hidden states for residual connection + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_gemma() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + cb(Qcur, "Qcur_scaled", il); 
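+ // the 1.0f/sqrt(n_embd_head_k) attention scale is folded into Qcur here, so build_attn below is called with a scale of 1.0f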
+ + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = build_norm(sa_out, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, sa_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_gemma2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e + switch (model.type) { + case LLM_TYPE_2B: + case LLM_TYPE_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; + case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; + default: GGML_ABORT("fatal error"); + }; + cb(Qcur, "Qcur_scaled", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + cur = build_norm(cur, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, 
il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = build_norm(sa_out, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = build_norm(cur, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, sa_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + // final logit soft-capping + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); + cur = ggml_tanh(ctx0, cur); + cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + + struct ggml_cgraph * build_starcoder2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 
1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_mamba() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = build_inp_embd(model.tok_embd); + + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); + cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // residual + cur = ggml_add(ctx0, cur, inpL); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + // final rmsnorm + cur = build_norm(inpL, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_command_r() { + + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct 
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + if (model.layers[il].attn_q_norm) { + Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur) * n_embd_head, + ggml_element_size(Qcur) * n_embd_head * n_head, + 0); + cb(Qcur, "Qcur", il); + Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, + ggml_element_size(Kcur) * n_embd_head, + ggml_element_size(Kcur) * n_embd_head * n_head_kv, + 0); + cb(Kcur, "Kcur", il); + + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM, il); + cb(Kcur, "Kcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = build_ffn(ffn_inp, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + + } + + struct ggml_cgraph * build_cohere2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + + // sliding window switch pattern + const int32_t sliding_window_pattern = 4; + + for (int il = 0; il < n_layer; ++il) { + // three layers sliding window attention (window size 
4096) and ROPE + // fourth layer uses global attention without positional embeddings + const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + + // norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // rope freq factors for 128k context + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + if (is_sliding) { + Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, + beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, + attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + } else { + // For non-sliding layers, just reshape without applying RoPE + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + } + + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, + n_tokens, 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, + il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // ref: https://allenai.org/olmo + // based on the original build_llama() function, changes: + // * non-parametric layer norm + // * clamp qkv + // * removed bias + // * removed MoE + struct ggml_cgraph * build_olmo() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + 
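+ // a single head size is used for Q, K and V, and it matches the RoPE dimension (asserted below)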
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + NULL, NULL, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, nullptr, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + NULL, NULL, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + NULL, NULL, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_olmo2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 
0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = inpL; + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + cur = build_norm(cur, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_ffn(ffn_inp, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // based on the build_qwen2moe() function, changes: + // * removed shared experts + // * removed bias + // * added q, k norm + struct ggml_cgraph * build_olmoe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = 
build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_openelm() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head_qkv = 2*n_head_kv + n_head; + + cur = inpL; + struct ggml_tensor * residual = cur; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * 
Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); + cb(Vcur, "Vcur", il); + + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur", il); + + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); + cb(Qcur, "Vcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = inpL; + + // norm + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_gptneox() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + 
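+ // the fused wqkv projection is split row-wise: Q takes the first n_embd values of each row, K the next n_embd_gqa, V the last n_embd_gqa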
cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // ffn + if (hparams.use_par_res) { + // attention and ffn are computed in parallel + // x = x + attn(ln1(x)) + ffn(ln2(x)) + + struct ggml_tensor * attn_out = cur; + + cur = build_norm(inpL, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, attn_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } else { + // attention and ffn are computed sequentially + // x = x + attn(ln1(x)) + // x = x + ffn(ln2(x)) + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_arctic() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", 
il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); + cb(ffn_out, "ffn_out", il); + + // MoE + cur = build_norm(inpSA, + model.layers[il].ffn_norm_exps, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm_exps", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_out); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_deepseek() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + if ((uint32_t) il < hparams.n_layer_dense_lead) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(moe_out, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + + 
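+ // result_output holds the raw logits for the selected output tokens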
ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_deepseek2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + bool is_lite = (hparams.n_layer == 27); + + // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. + // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); + const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); + const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + struct ggml_tensor * q = NULL; + if (!is_lite) { + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = build_norm(q, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + } else { + q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(q, "q", il); + } + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); + + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); + + // split into {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); + + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + // TODO: the CUDA backend used to not support non-cont. 
(RMS) norm, investigate removing ggml_cont + kv_compressed = ggml_cont(ctx0, kv_compressed); + kv_compressed = build_norm(kv_compressed, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, il); + cb(kv_compressed, "kv_compressed", il); + + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // shared RoPE key + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(k_pe, "k_pe", il); + + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + if ((uint32_t) il < hparams.n_layer_dense_lead) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + true, hparams.expert_weights_scale, + (enum llama_expert_gating_func_type) hparams.expert_gating_func, + il); + cb(moe_out, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_bitnet() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].wq_scale) { + Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); + } + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + 
cb(Qcur, "Qcur", il); + } + + // B1.K + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].wk_scale) { + Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); + } + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + // B1.V + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].wv_scale) { + Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); + } + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + NULL, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + + cur = build_norm(cur, + model.layers[il].attn_sub_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_sub_norm", il); + + cur = build_lora_mm(model.layers[il].wo, cur); + if (model.layers[il].wo_scale) { + cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); + } + if (model.layers[il].bo) { + cur = ggml_add(ctx0, cur, model.layers[il].bo); + } + cb(cur, "attn_o_out", il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward forward + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, + NULL, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_sub_out", il); + + cur = build_norm(cur, + model.layers[il].ffn_sub_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_sub_norm", il); + + cur = build_lora_mm(model.layers[il].ffn_down, cur); + if (model.layers[il].ffn_down_scale) { + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); + } + cb(cur, "ffn_down", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + // FIXME: do not use model.tok_embd directly, duplicate as model.output + cur = build_lora_mm(model.tok_embd, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; + } + + //struct ggml_cgraph * build_t5_enc() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + + // inpL = build_inp_embd(model.tok_embd); + + // GGML_ASSERT(lctx.is_encoding); 
+ // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); + + // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; + + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); + + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + // cb(Qcur, "Qcur", il); + + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + // cb(Kcur, "Kcur", il); + + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + // cb(Vcur, "Vcur", il); + + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); + + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); + + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); + + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + // cb(v, "v", il); + + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); + + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); + + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); + + // ggml_build_forward_expand(gf, cur); + + // cur = build_lora_mm(model.layers[il].wo_enc, cur); + // cb(cur, "kqv_out", il); + // } + + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // } + + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + // cb(ffn_inp, "ffn_inp", il); + + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); + + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up_enc, NULL, NULL, + // model.layers[il].ffn_gate_enc, NULL, NULL, + // model.layers[il].ffn_down_enc, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + // il); + // cb(cur, "ffn_out", il); + // } + + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); + + // ggml_tensor * layer_dir = cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); + + // // input for next layer + // inpL = cur; + // } + + // cur = inpL; + // cb(cur, "result_embd", -1); + + // cur = build_norm(cur, + // model.output_norm_enc, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} + + //struct ggml_cgraph * build_t5_dec() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + + // inpL = build_inp_embd(model.tok_embd); + + // GGML_ASSERT(!lctx.is_encoding); + // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + + // struct ggml_tensor * embd_enc = build_inp_embd_enc(); + // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); + + // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; + + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); + + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + // cb(Qcur, "Qcur", il); + + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + // cb(Kcur, "Kcur", il); + + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + // cb(Vcur, "Vcur", il); + + // build_kv_store(gf, Kcur, Vcur, il); + + // struct ggml_tensor * k = + // ggml_view_3d(ctx0, kv_self.k_l[il], + // n_embd_head_k, n_kv, n_head_kv, + // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + // 0); + // cb(k, "k", il); + + // struct ggml_tensor * v = + // ggml_view_3d(ctx0, kv_self.v_l[il], + // n_kv, n_embd_head_v, n_head_kv, + // ggml_element_size(kv_self.v_l[il])*n_ctx, + // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + // 0); + // cb(v, "v", il); + + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); + + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); + + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); + + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + // cb(kqv, "kqv", il); + + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); + + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); + + // ggml_build_forward_expand(gf, cur); + + // cur = build_lora_mm(model.layers[il].wo, cur); + // cb(cur, "kqv_out", il); + // } + + // cur = ggml_add(ctx0, cur, inpSA); + // cb(cur, "cross_inp", il); + + // struct ggml_tensor * inpCA = cur; + + // // norm + // cur = build_norm(cur, + // model.layers[il].attn_norm_cross, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm_cross", il); + + // // cross-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + // cb(Qcur, "Qcur", il); + + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + // cb(Kcur, "Kcur", il); + + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + // cb(Vcur, "Vcur", il); + + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); + + // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); + + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + // cb(v, "v", il); + + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); + + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); + + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); + + // ggml_build_forward_expand(gf, cur); + + // cur = build_lora_mm(model.layers[il].wo_cross, cur); + // cb(cur, "kqv_out", il); + // } + + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + // } + + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + // cb(ffn_inp, "ffn_inp", il); + + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); + + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + // il); + // cb(cur, "ffn_out", il); + // } + + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); + + // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); + + // // input for next layer + // inpL = cur; + // } + + // cur = inpL; + // cb(cur, "result_embd", -1); + + // cur = build_norm(cur, + // model.output_norm, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // // lm_head + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} + + struct ggml_cgraph * build_jais() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_chatglm() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == 
hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv == nullptr) { + Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + } + Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + } + Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + } + } else { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + if (model.layers[il].bqkv) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor); + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = build_norm(inpL, + model.output_norm, + NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_nemotron() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + 
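+        // nemotron uses LayerNorm with bias (LLM_NORM) and a squared-ReLU sequential FFN, unlike the RMSNorm/SiLU graphs above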
const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + //GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_exaone() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = 
build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + ggml_cgraph * build_rwkv6() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + GGML_ASSERT(hparams.token_shift_count == 2); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; + + for (int il = 0; il < n_layer; ++il) { + const 
llama_layer * layer = &model.layers[il]; + + struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); + + struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + cb(att_norm, "attn_norm", il); + + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + att_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); + + cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + cb(ffn_norm, "ffn_norm", il); + + x_prev = ggml_concat( + ctx0, + ffn_shift, + ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), + 1 + ); + + cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); + cur = ggml_add(ctx0, cur, ffn_inp); + + token_shift = ggml_concat(ctx0, + ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), + ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), + 1 + ); + ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + + if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + cur = ggml_scale(ctx0, cur, 0.5F); + } + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py + ggml_cgraph * build_rwkv6qwen2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; + + inpL = build_inp_embd(model.tok_embd); + + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; + + struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); + + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, 
il); + cb(att_norm, "attn_norm", il); + + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + token_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); + + cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + + token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); + ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // ref: https://github.com/facebookresearch/chameleon + // based on the original build_llama() function, changes: + // * qk-norm + // * swin-norm + // * removed bias + // * removed MoE + struct ggml_cgraph * build_chameleon() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + if (hparams.swin_norm) { + cur = inpL; + } else { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + if (model.layers[il].attn_q_norm) { + Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur) * n_embd_head, + ggml_element_size(Qcur) * n_embd_head * n_head, + 0); + cb(Qcur, "Qcur", il); + + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + } + + if (model.layers[il].attn_k_norm) { + Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, + ggml_element_size(Kcur) * n_embd_head, + ggml_element_size(Kcur) * n_embd_head * n_head_kv, + 0); + cb(Kcur, "Kcur", il); + + 
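+                    // qk-norm: K was reshaped to per-head layout above, so the norm is applied to each head before RoPE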
Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, il); + cb(Kcur, "Kcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, nullptr, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + + if (hparams.swin_norm) { + cur = build_norm(cur, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + } + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (!hparams.swin_norm) { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + } + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + if (hparams.swin_norm) { + cur = build_norm(cur, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output_with_img_logits", -1); + + // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. + // Needs to be removed once image outputs are supported. 
+ int img_token_end_idx = 8196; + int img_token_start_idx = 4; + int num_img_tokens = img_token_end_idx - img_token_start_idx; + // creates 1d tensor of size num_img_tokens and values -FLT_MAX, + // which ensures that text token values are always at least larger than image token values + struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); + img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); + cb(img_logits, "img_logits", -1); + cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_wavtokenizer_dec() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_b); + + // posnet + for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { + const auto & layer = model.layers[il].posnet; + + inpL = cur; + + switch (il) { + case 0: + case 1: + case 3: + case 4: + { + cur = build_norm(cur, + layer.norm1, + layer.norm1_b, + LLM_NORM_GROUP, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv1_b); + + cur = build_norm(cur, + layer.norm2, + layer.norm2_b, + LLM_NORM_GROUP, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv2_b); + + cur = ggml_add(ctx0, cur, inpL); + } break; + case 2: + { + cur = build_norm(cur, + layer.attn_norm, + layer.attn_norm_b, + LLM_NORM_GROUP, 0); + + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; + + q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); + k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); + v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); + + q = ggml_add(ctx0, q, layer.attn_q_b); + k = ggml_add(ctx0, k, layer.attn_k_b); + v = ggml_add(ctx0, v, layer.attn_v_b); + + q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); + k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + + kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); + + cur = ggml_mul_mat(ctx0, kq, v); + + cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.attn_o_b); + + cur = ggml_add(ctx0, cur, inpL); + } break; + case 5: + { + cur = build_norm(cur, + layer.norm, + layer.norm_b, + LLM_NORM_GROUP, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; + } + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, -1); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = cur; + + // convnext + for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { + const auto & layer = model.layers[il].convnext; + + cur = inpL; + + cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.dw_b); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + layer.norm, + layer.norm_b, + LLM_NORM, -1); + + cur = build_ffn(cur, + layer.pw1, layer.pw1_b, NULL, + NULL, NULL, NULL, + layer.pw2, layer.pw2_b, NULL, + NULL, + LLM_FFN_GELU, 
LLM_FFN_SEQ, il); + + cur = ggml_mul(ctx0, cur, layer.gamma); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = ggml_add(ctx0, cur, inpL); + } + + cur = inpL; + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_embd", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } +}; + +ggml_cgraph * llama_model::build_graph( + llama_graph_i & lgf, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + ggml_context_ptr && ctx, + bool worst_case) const { + struct ggml_cgraph * result = NULL; + + struct llm_build_context llm(lgf, *this, cparams, ubatch, std::move(ctx), worst_case); + + switch (arch) { + case LLM_ARCH_LLAMA: + case LLM_ARCH_MINICPM: + case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: + { + result = llm.build_llama(); + } break; + case LLM_ARCH_DECI: + { + result = llm.build_deci(); + } break; + case LLM_ARCH_BAICHUAN: + { + result = llm.build_baichuan(); + } break; + case LLM_ARCH_FALCON: + { + result = llm.build_falcon(); + } break; + case LLM_ARCH_GROK: + { + result = llm.build_grok(); + } break; + case LLM_ARCH_STARCODER: + { + result = llm.build_starcoder(); + } break; + case LLM_ARCH_REFACT: + { + result = llm.build_refact(); + } break; + case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_NOMIC_BERT: + { + result = llm.build_bert(); + } break; + case LLM_ARCH_BLOOM: + { + result = llm.build_bloom(); + } break; + case LLM_ARCH_MPT: + { + result = llm.build_mpt(); + } break; + case LLM_ARCH_STABLELM: + { + result = llm.build_stablelm(); + } break; + case LLM_ARCH_QWEN: + { + result = llm.build_qwen(); + } break; + case LLM_ARCH_QWEN2: + { + result = llm.build_qwen2(); + } break; + case LLM_ARCH_QWEN2VL: + { + result = llm.build_qwen2vl(); + } break; + case LLM_ARCH_QWEN2MOE: + { + result = llm.build_qwen2moe(); + } break; + case LLM_ARCH_PHI2: + { + result = llm.build_phi2(); + } break; + case LLM_ARCH_PHI3: + case LLM_ARCH_PHIMOE: + { + result = llm.build_phi3(); + } break; + case LLM_ARCH_PLAMO: + { + result = llm.build_plamo(); + } break; + case LLM_ARCH_GPT2: + { + result = llm.build_gpt2(); + } break; + case LLM_ARCH_CODESHELL: + { + result = llm.build_codeshell(); + } break; + case LLM_ARCH_ORION: + { + result = llm.build_orion(); + } break; + case LLM_ARCH_INTERNLM2: + { + result = llm.build_internlm2(); + } break; + case LLM_ARCH_MINICPM3: + { + result = llm.build_minicpm3(); + } break; + case LLM_ARCH_GEMMA: + { + result = llm.build_gemma(); + } break; + case LLM_ARCH_GEMMA2: + { + result = llm.build_gemma2(); + } break; + case LLM_ARCH_STARCODER2: + { + result = llm.build_starcoder2(); + } break; + case LLM_ARCH_MAMBA: + { + result = llm.build_mamba(); + } break; + case LLM_ARCH_XVERSE: + { + result = llm.build_xverse(); + } break; + case LLM_ARCH_COMMAND_R: + { + result = llm.build_command_r(); + } break; + case LLM_ARCH_COHERE2: + { + result = llm.build_cohere2(); + } break; + case LLM_ARCH_DBRX: + { + result = llm.build_dbrx(); + } break; + case LLM_ARCH_OLMO: + { + result = llm.build_olmo(); + } break; + case LLM_ARCH_OLMO2: + { + result = llm.build_olmo2(); + } break; + case LLM_ARCH_OLMOE: + { + result = llm.build_olmoe(); + } break; + case LLM_ARCH_OPENELM: + { + result = llm.build_openelm(); + } break; + case LLM_ARCH_GPTNEOX: + { + result = llm.build_gptneox(); + } break; + case LLM_ARCH_ARCTIC: + { + 
result = llm.build_arctic(); + } break; + case LLM_ARCH_DEEPSEEK: + { + result = llm.build_deepseek(); + } break; + case LLM_ARCH_DEEPSEEK2: + { + result = llm.build_deepseek2(); + } break; + case LLM_ARCH_CHATGLM: + { + result = llm.build_chatglm(); + } break; + case LLM_ARCH_BITNET: + { + result = llm.build_bitnet(); + } break; + //case LLM_ARCH_T5: + // { + // if (lctx.is_encoding) { + // result = llm.build_t5_enc(); + // } else { + // result = llm.build_t5_dec(); + // } + // } break; + //case LLM_ARCH_T5ENCODER: + // { + // result = llm.build_t5_enc(); + // } break; + case LLM_ARCH_JAIS: + { + result = llm.build_jais(); + } break; + case LLM_ARCH_NEMOTRON: + { + result = llm.build_nemotron(); + } break; + case LLM_ARCH_EXAONE: + { + result = llm.build_exaone(); + } break; + case LLM_ARCH_RWKV6: + { + result = llm.build_rwkv6(); + } break; + case LLM_ARCH_RWKV6QWEN2: + { + result = llm.build_rwkv6qwen2(); + } break; + case LLM_ARCH_CHAMELEON: + { + result = llm.build_chameleon(); + } break; + case LLM_ARCH_WAVTOKENIZER_DEC: + { + result = llm.build_wavtokenizer_dec(); + } break; + default: + GGML_ABORT("fatal error"); + } + + // add on pooling layer + if (cparams.embeddings) { + result = llm.append_pooling(result); + } + + return result; +} + // // interface implementation // diff --git a/src/llama-model.h b/src/llama-model.h index a7c30444786fd..5d2a07abc570f 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -5,11 +5,16 @@ #include "llama-hparams.h" #include "llama-vocab.h" +#include "ggml-cpp.h" + #include #include #include #include +class llama_graph_i; +struct llama_cparams; +struct llama_ubatch; struct llama_model_loader; // available models @@ -362,6 +367,14 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; + // TODO: add encode/decode graphs + ggml_cgraph * build_graph( + llama_graph_i & lgf, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + ggml_context_ptr && ctx, + bool worst_case) const; + private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama.cpp b/src/llama.cpp index e71a87ee9fcdf..83b66035fc585 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9,7431 +9,18 @@ #include "ggml.h" #include "ggml-backend.h" -#include "ggml-cpp.h" #include -#include -#include -#include -#include #include #include #include #include #include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -// -// llm_build -// - -using llm_build_cb = std::function; - -enum llm_ffn_op_type { - LLM_FFN_SILU, - LLM_FFN_GELU, - LLM_FFN_RELU, - LLM_FFN_RELU_SQR, - LLM_FFN_SWIGLU, -}; - -enum llm_ffn_gate_type { - LLM_FFN_SEQ, - LLM_FFN_PAR, // ffn_gate is parallel to ffn_up -}; - -enum llm_norm_type { - LLM_NORM, - LLM_NORM_RMS, - LLM_NORM_GROUP, -}; - -struct llm_build_context { - llama_graph_i & lgf; - const llama_model & model; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - - const int64_t n_embd; - const int64_t n_layer; - const int64_t n_rot; - const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) - const int64_t n_head; - const int64_t n_head_kv; - const int64_t n_embd_head_k; - const int64_t n_embd_k_gqa; - const int64_t n_embd_head_v; - const int64_t n_embd_v_gqa; - const int64_t n_expert; - const int64_t n_expert_used; - - const float freq_base; - const float freq_scale; - const float ext_factor; - const float attn_factor; - const float beta_fast; - const float beta_slow; - const float 
norm_eps; - const float norm_rms_eps; - - const int32_t n_tokens; - const int32_t n_ctx_orig; - - const bool worst_case; - const bool flash_attn; - - const enum llama_pooling_type pooling_type; - const enum llama_rope_type rope_type; - - const llm_build_cb & cb; - - const ggml_context_ptr ctx = nullptr; - ggml_context * ctx0 = nullptr; - - // TODO: consider making the entire interface noexcept - llm_build_context( - llama_graph_i & lgf, - const llama_model & model, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - llm_build_cb && cb, - ggml_context_ptr && ctx, - bool worst_case) : - lgf (lgf), - model (model), - hparams (model.hparams), - cparams (cparams), - ubatch (ubatch), - n_embd (hparams.n_embd), - n_layer (hparams.n_layer), - n_rot (hparams.n_rot), - n_ctx (cparams.n_ctx), - n_head (hparams.n_head()), - n_head_kv (hparams.n_head_kv()), - n_embd_head_k (hparams.n_embd_head_k), - n_embd_k_gqa (hparams.n_embd_k_gqa()), - n_embd_head_v (hparams.n_embd_head_v), - n_embd_v_gqa (hparams.n_embd_v_gqa()), - n_expert (hparams.n_expert), - n_expert_used (hparams.n_expert_used), - freq_base (cparams.rope_freq_base), - freq_scale (cparams.rope_freq_scale), - ext_factor (cparams.yarn_ext_factor), - attn_factor (cparams.yarn_attn_factor), - beta_fast (cparams.yarn_beta_fast), - beta_slow (cparams.yarn_beta_slow), - norm_eps (hparams.f_norm_eps), - norm_rms_eps (hparams.f_norm_rms_eps), - n_tokens (ubatch.n_tokens), - n_ctx_orig (cparams.n_ctx_orig_yarn), - worst_case (worst_case), - flash_attn (cparams.flash_attn), - pooling_type (cparams.pooling_type), - rope_type (hparams.rope_type), - cb (std::move(cb)), - ctx (std::move(ctx)), - ctx0 (this->ctx.get()) { - } - - // TODO: tmp - struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch); - cb(inpL, "inp_embd", -1); - - return inpL; - } - - // TODO: tmp - struct ggml_tensor * build_lora_mm( - struct ggml_tensor * w, - struct ggml_tensor * cur) { - return lgf.build_lora_mm(ctx0, w, cur); - } - - // TODO: tmp - struct ggml_tensor * build_lora_mm_id( - struct ggml_tensor * w, // struct ggml_tensor * as - struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { - return lgf.build_lora_mm_id(ctx0, w, cur, ids); - } - - struct ggml_tensor * build_norm( - struct ggml_tensor * cur, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type, - int il) { - switch (type) { - case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; - case LLM_NORM_GROUP: - { - cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); - } break; - } - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx0, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - - if (mb) { - cur = ggml_add(ctx0, cur, mb); - } - - return cur; - } - - struct ggml_tensor * build_ffn( - struct ggml_tensor * cur, - struct ggml_tensor * up, - struct ggml_tensor * up_b, - struct ggml_tensor * up_s, - struct ggml_tensor * gate, - struct ggml_tensor * gate_b, - struct ggml_tensor * gate_s, - struct ggml_tensor * down, - struct ggml_tensor * down_b, - struct ggml_tensor * down_s, - struct ggml_tensor * act_scales, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, - const llm_build_cb & cb, 
- int il) { - struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx0, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (up_s) { - tmp = ggml_mul(ctx0, tmp, up_s); - cb(tmp, "ffn_up_s", il); - } - - if (gate) { - switch (type_gate) { - case LLM_FFN_SEQ: - { - cur = build_lora_mm(gate, tmp); - cb(cur, "ffn_gate", il); - } break; - case LLM_FFN_PAR: - { - cur = build_lora_mm(gate, cur); - cb(cur, "ffn_gate", il); - } break; - } - - if (gate_b) { - cur = ggml_add(ctx0, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - - if (gate_s) { - cur = ggml_mul(ctx0, cur, gate_s); - cb(cur, "ffn_gate_s", il); - } - - } else { - cur = tmp; - } - - switch (type_op) { - case LLM_FFN_SILU: - { - cur = ggml_silu(ctx0, cur); - cb(cur, "ffn_silu", il); - } break; - case LLM_FFN_GELU: - { - cur = ggml_gelu(ctx0, cur); - cb(cur, "ffn_gelu", il); - if (act_scales != NULL) { - cur = ggml_div(ctx0, cur, act_scales); - cb(cur, "ffn_act", il); - } - } break; - case LLM_FFN_RELU: - { - cur = ggml_relu(ctx0, cur); - cb(cur, "ffn_relu", il); - } break; - case LLM_FFN_RELU_SQR: - { - cur = ggml_relu(ctx0, cur); - cb(cur, "ffn_relu", il); - - cur = ggml_sqr(ctx0, cur); - cb(cur, "ffn_sqr(relu)", il); - } break; - case LLM_FFN_SWIGLU: - { - // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - int64_t split_point = cur->ne[0] / 2; - struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); - struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_silu(ctx0, x0); - cb(cur, "ffn_silu", il); - - cur = ggml_mul(ctx0, x0, x1); - cb(cur, "ffn_mul", il); - } break; - } - - if (type_gate == LLM_FFN_PAR) { - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "ffn_gate_par", il); - } - - if (down) { - cur = build_lora_mm(down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx0, cur, down_b); - } - - if (down_s) { - cur = ggml_mul(ctx0, cur, down_s); - cb(cur, "ffn_down_s", il); - } - - return cur; - } - - struct ggml_tensor * build_moe_ffn( - struct ggml_tensor * cur, - struct ggml_tensor * gate_inp, - struct ggml_tensor * up_exps, - struct ggml_tensor * gate_exps, - struct ggml_tensor * down_exps, - struct ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llm_ffn_op_type type_op, - bool norm_w, - bool scale_w, - float w_scale, - llama_expert_gating_func_type gating_op, - const llm_build_cb & cb, - int il) { - int64_t n_embd = cur->ne[0]; - int64_t n_tokens = cur->ne[1]; - - ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = nullptr; - switch (gating_op) { - case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: - { - probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] - } break; - case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: - { - probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] - } break; - default: - GGML_ABORT("fatal error"); - } - cb(probs, "ffn_moe_probs", il); - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx0, probs, exp_probs_b); - cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * 
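// Illustrative sketch (not part of this patch): the LLM_FFN_SWIGLU branch above splits the
// up-projection output in half along the feature dimension, applies SiLU to the first half
// and multiplies element-wise with the second. swiglu_sketch is a hypothetical scalar analog.
#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

static std::vector<float> swiglu_sketch(const std::vector<float> & packed) {
    const size_t half = packed.size() / 2;            // split_point in the graph code
    std::vector<float> out(half);
    for (size_t i = 0; i < half; ++i) {
        out[i] = silu(packed[i]) * packed[half + i];   // silu(x0) * x1
    }
    return out;
}

int main() {
    const std::vector<float> packed = {0.5f, -1.0f, 2.0f, 1.5f, 0.25f, -0.75f};
    for (float v : swiglu_sketch(packed)) printf("%.4f ", v);
    printf("\n");
}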
selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - if (norm_w) { - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); - - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - - weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); - } - if (scale_w) { - weights = ggml_scale(ctx0, weights, w_scale); - cb(weights, "ffn_moe_weights_scaled", il); - } - - cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); - - switch (type_op) { - case LLM_FFN_SILU: - { - gate = ggml_silu(ctx0, gate); - cb(gate, "ffn_moe_silu", il); - } break; - case LLM_FFN_GELU: - { - gate = ggml_gelu(ctx0, gate); - cb(gate, "ffn_moe_gelu", il); - } break; - default: - GGML_ABORT("fatal error"); - } - - ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); - - ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx0, experts, weights); - - // aggregate experts - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, - experts->nb[2], i*experts->nb[1]); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx0, moe_out, cur_expert); - } - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx0, moe_out); - } - - return moe_out; - } - - struct ggml_tensor * build_attn( - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, - int32_t n_tokens, - float kq_scale, - const llm_build_cb & cb, - int il) { - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); - - //build_kv_store(graph, k_cur, v_cur, il); - lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); - - struct ggml_tensor * cur; - - //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); - cb(cur, "kqv_out", il); - - return cur; - } - - struct ggml_tensor * build_rwkv_channel_mix( - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev, - const llm_arch arch) { - struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - switch (arch) { - case LLM_ARCH_RWKV6: - { - struct ggml_tensor * xk = ggml_add(ctx0, 
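// Illustrative sketch (not part of this patch): the routing math in build_moe_ffn() above
// selects the top n_expert_used experts by probability (the selection may use bias-adjusted
// probs, as in DeepSeek V3, while the weights come from the unbiased ones) and optionally
// renormalizes the selected weights to sum to 1. route_sketch and expert_choice are hypothetical.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

struct expert_choice { int id; float weight; };

static std::vector<expert_choice> route_sketch(const std::vector<float> & probs, int n_used, bool norm_w) {
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    // top-k by probability, which ggml_top_k realizes via an argsort
    std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    std::vector<expert_choice> out;
    float sum = 0.0f;
    for (int i = 0; i < n_used; ++i) {
        out.push_back({idx[i], probs[idx[i]]});
        sum += probs[idx[i]];
    }
    if (norm_w) {
        for (auto & c : out) c.weight /= sum;          // ffn_moe_weights_norm
    }
    return out;
}

int main() {
    const std::vector<float> probs = {0.05f, 0.40f, 0.15f, 0.30f, 0.10f};
    for (const auto & c : route_sketch(probs, 2, true)) {
        printf("expert %d weight %.3f\n", c.id, c.weight);
    }
}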
ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); - struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); - - struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); - struct ggml_tensor * k = ggml_sqr( - ctx0, - ggml_relu( - ctx0, - build_lora_mm(layer->channel_mix_key, xk) - ) - ); - cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); - } break; - default: - GGML_ABORT("fatal error"); - } - - return cur; - } - - struct ggml_cgraph * build_k_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - lgf.build_k_shift(ctx0, gf); - - return gf; - } - - struct ggml_cgraph * build_defrag() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - lgf.build_defrag(ctx0, gf); - - return gf; - } - - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); - cb(cur, "inp_pos", -1); - - return cur; - } - - struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case); - cb(cur, "inp_out_ids", -1); - - return cur; - } - - struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens); - cb(cur, "inp_mean", -1); - - return cur; - } - - struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens); - cb(cur, "inp_cls", -1); - - return cur; - } - - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { - // find result_norm tensor for input - struct ggml_tensor * inp = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - inp = ggml_graph_node(gf, i); - if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { - break; - } - - inp = nullptr; - } - GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); - - struct ggml_tensor * cur; - - switch (pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - cur = inp; - } break; - case LLAMA_POOLING_TYPE_MEAN: - { - struct ggml_tensor * inp_mean = build_inp_mean(); - cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean); - } break; - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - struct ggml_tensor * inp_cls = build_inp_cls(); - cur = ggml_get_rows(ctx0, inp, inp_cls); - } break; - case LLAMA_POOLING_TYPE_RANK: - { - struct ggml_tensor * inp_cls = build_inp_cls(); - inp = ggml_get_rows(ctx0, inp, inp_cls); - - // classification head - // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 - GGML_ASSERT(model.cls != nullptr); - GGML_ASSERT(model.cls_b != nullptr); - - cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b); - cur = ggml_tanh(ctx0, cur); - - // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 - if (model.cls_out) { - GGML_ASSERT(model.cls_out_b != nullptr); - - cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b); - } - } break; - default: - { - GGML_ABORT("unknown pooling type"); - } - } - - cb(cur, "result_embd_pooled", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - //struct ggml_tensor * build_pos_bucket(bool causal) { - // if (causal) { - // lctx.inp_pos_bucket = 
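// Illustrative sketch (not part of this patch): the LLAMA_POOLING_TYPE_MEAN case in
// append_pooling() above is expressed as a matrix multiply with a precomputed inp_mean
// matrix, which amounts to averaging the per-token embeddings of each sequence.
// mean_pool_sketch is a hypothetical scalar analog of that averaging.
#include <cstdio>
#include <vector>

// embd holds n_tokens rows of n_embd floats belonging to one sequence
static std::vector<float> mean_pool_sketch(const std::vector<std::vector<float>> & embd) {
    std::vector<float> out(embd[0].size(), 0.0f);
    for (const auto & row : embd) {
        for (size_t i = 0; i < row.size(); ++i) out[i] += row[i];
    }
    for (float & v : out) v /= embd.size();
    return out;
}

int main() {
    const std::vector<std::vector<float>> embd = {{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}};
    for (float v : mean_pool_sketch(embd)) printf("%.2f ", v);   // 3.00 4.00
    printf("\n");
}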
ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - // } else { - // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - // } - - // ggml_set_input(lctx.inp_pos_bucket); - // cb(lctx.inp_pos_bucket, "pos_bucket", -1); - - // return lctx.inp_pos_bucket; - //} - - //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - // cb(pos_bucket_1d, "pos_bucket_1d", -1); - - // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_cont(ctx0, pos_bias); - // cb(pos_bias, "pos_bias", -1); - - // return pos_bias; - //} - - struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case); - cb(cur, "embd_enc", -1); - - return cur; - } - - struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); - cb(cur, "KQ_mask_cross", -1); - - return cur; - } - - struct ggml_cgraph * build_llama() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - const float kq_scale = hparams.f_attention_scale == 0.0f ? 
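// Illustrative sketch (not part of this patch): the kq_scale expression that continues
// below uses the model's f_attention_scale when set, otherwise the usual scaled-dot-product
// factor 1/sqrt(n_embd_head). kq_scale_sketch is a hypothetical standalone version.
#include <cmath>
#include <cstdio>

static float kq_scale_sketch(float f_attention_scale, int n_embd_head) {
    return f_attention_scale == 0.0f ? 1.0f/std::sqrt((float) n_embd_head) : f_attention_scale;
}

int main() {
    printf("%.6f\n", kq_scale_sketch(0.0f, 128));  // default: 1/sqrt(128)
    printf("%.6f\n", kq_scale_sketch(0.5f, 128));  // model override
}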
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // For Granite architecture - if (hparams.f_residual_scale) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - if (model.layers[il].ffn_gate_inp == nullptr) { - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } else { - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - } - - // For Granite architecture - if (hparams.f_residual_scale) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = 
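// Illustrative sketch (not part of this patch): the ggml_rope_ext calls above rotate
// Q/K in pairs of dimensions by a position-dependent angle. This scalar sketch ignores
// the YaRN factors, frequency scaling and the normal-vs-NEOX pairing layout, so treat it
// as intuition only; rope_sketch is a hypothetical helper.
#include <cmath>
#include <cstdio>
#include <vector>

static void rope_sketch(std::vector<float> & head, int pos, float freq_base) {
    const int d = (int) head.size();
    for (int i = 0; i < d; i += 2) {
        const float theta = pos * std::pow(freq_base, -(float) i / d);
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = head[i], x1 = head[i + 1];
        head[i]     = x0*c - x1*s;
        head[i + 1] = x0*s + x1*c;
    }
}

int main() {
    std::vector<float> q = {1.0f, 0.0f, 1.0f, 0.0f};
    rope_sketch(q, /*pos=*/3, /*freq_base=*/10000.0f);
    for (float v : q) printf("%.4f ", v);
    printf("\n");
}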
build_lora_mm(model.output, cur); - - // For Granite architecture - if (hparams.f_logit_scale) { - cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); - } - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_deci() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head = hparams.n_head(il); - - if (n_head == 0) { - // attention-free layer of Llama-3_1-Nemotron-51B - cur = inpL; - } else { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - } - - if (n_head > 0 && n_head_kv == 0) { - // "linear attention" of Llama-3_1-Nemotron-51B - cur = build_lora_mm(model.layers[il].wo, cur); - cb(cur, "wo", il); - } else if (n_head > 0) { - // self-attention - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // For Granite architecture - if (hparams.f_residual_scale) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); - } - - // modified to support attention-free layer of Llama-3_1-Nemotron-51B - struct ggml_tensor * ffn_inp = cur; - if (n_head > 0) { - ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - } - - // feed-forward network - if 
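// Illustrative sketch (not part of this patch): the per-layer n_head / n_head_kv values
// above imply grouped-query attention, where each KV head is shared by n_head/n_head_kv
// consecutive query heads (assuming n_head_kv divides n_head and both are non-zero).
// kv_head_for_q_head is a hypothetical helper showing that mapping.
#include <cstdio>

static int kv_head_for_q_head(int q_head, int n_head, int n_head_kv) {
    const int group = n_head / n_head_kv;   // query heads per KV head
    return q_head / group;
}

int main() {
    const int n_head = 32, n_head_kv = 8;
    for (int h = 0; h < n_head; h += 7) {
        printf("q head %2d -> kv head %d\n", h, kv_head_for_q_head(h, n_head, n_head_kv));
    }
}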
(model.layers[il].ffn_gate_inp == nullptr) { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // For Granite architecture - if (hparams.f_residual_scale) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - // For Granite architecture - if (hparams.f_logit_scale) { - cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); - } - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - switch (model.type) { - case LLM_TYPE_7B: - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - break; - case LLM_TYPE_13B: - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); - break; - default: - GGML_ABORT("fatal error"); - } - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, 
il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_xverse() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 
model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; - - attn_norm = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(attn_norm, "attn_norm", il); - - // self-attention - { - if (model.layers[il].attn_norm_2) { - // Falcon-40B - cur = build_norm(inpL, - model.layers[il].attn_norm_2, - model.layers[il].attn_norm_2_b, - LLM_NORM, il); - cb(cur, "attn_norm_2", il); - } else { - cur = attn_norm; - } - - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - // using mode = 2 for neox mode - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = cur; - - // feed forward - { - cur = build_ffn(attn_norm, // !! 
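// Illustrative sketch (not part of this patch): after the fused wqkv projection above,
// each row holds [Q | K | V] back to back and the three ggml_view_2d calls slice it at
// offsets 0, n_embd and n_embd + n_embd_gqa. split_qkv_sketch is a hypothetical
// plain-vector analog of that slicing.
#include <cstdio>
#include <vector>

struct qkv_split { std::vector<float> q, k, v; };

static qkv_split split_qkv_sketch(const std::vector<float> & row, int n_embd, int n_embd_gqa) {
    qkv_split out;
    out.q.assign(row.begin(),                       row.begin() + n_embd);
    out.k.assign(row.begin() + n_embd,              row.begin() + n_embd + n_embd_gqa);
    out.v.assign(row.begin() + n_embd + n_embd_gqa, row.end());
    return out;
}

int main() {
    const int n_embd = 4, n_embd_gqa = 2;
    std::vector<float> row(n_embd + 2*n_embd_gqa);
    for (size_t i = 0; i < row.size(); ++i) row[i] = (float) i;
    const qkv_split s = split_qkv_sketch(row, n_embd, n_embd_gqa);
    printf("q=%zu k=%zu v=%zu\n", s.q.size(), s.k.size(), s.v.size());
}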
use the attn norm, not the result - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cur = ggml_add(ctx0, cur, inpL); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_grok() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // multiply by embedding_multiplier_scale of 78.38367176906169 - inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // Grok - // if attn_out_norm is present then apply it before adding the input - if (model.layers[il].attn_out_norm) { - cur = build_norm(cur, - model.layers[il].attn_out_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_out_norm", il); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = 
build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_GELU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - // Grok - // if layer_out_norm is present then apply it before adding the input - // Idea: maybe ffn_out_norm is a better name - if (model.layers[il].layer_out_norm) { - cur = build_norm(cur, - model.layers[il].layer_out_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "layer_out_norm", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - // Grok - // multiply logits by output_multiplier_scale of 0.5773502691896257 - - cur = ggml_scale(ctx0, cur, 0.5773502691896257f); - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_dbrx() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; - - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(cur, "wqkv_clamped", il); - - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, 
inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].attn_out_norm, NULL, - LLM_NORM, il); - cb(cur, "attn_out_norm", il); - - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); - cb(pos, "pos_embd", -1); - - inpL = ggml_add(ctx0, inpL, pos); - cb(inpL, "inpL", -1); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, 
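// Illustrative sketch (not part of this patch): the learned absolute position embedding
// above is a per-position row lookup (ggml_get_rows on pos_embd) added to the token
// embedding (ggml_add). add_pos_embd_sketch is a hypothetical plain-vector analog.
#include <cstdio>
#include <vector>

static void add_pos_embd_sketch(std::vector<float> & tok_embd,
                                const std::vector<std::vector<float>> & pos_embd,
                                int pos) {
    const std::vector<float> & row = pos_embd[pos];   // get_rows on inp_pos
    for (size_t i = 0; i < tok_embd.size(); ++i) tok_embd[i] += row[i];
}

int main() {
    std::vector<float> tok = {0.1f, 0.2f};
    const std::vector<std::vector<float>> pos_embd = {{0.0f, 0.0f}, {1.0f, -1.0f}};
    add_pos_embd_sketch(tok, pos_embd, 1);
    printf("%.2f %.2f\n", tok[0], tok[1]);            // 1.10 -0.80
}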
cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_refact() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_bert() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * inp_pos = nullptr; - - if (model.arch != LLM_ARCH_JINA_BERT_V2) { - inp_pos = build_inp_pos(); - } - - // construct input embeddings (token, type, position) - inpL = build_inp_embd(model.tok_embd); - - // token types are hardcoded to zero ("Sentence A") - struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); - inpL = ggml_add(ctx0, inpL, type_row0); - if (model.arch == LLM_ARCH_BERT) { - inpL = ggml_add(ctx0, ggml_get_rows(ctx0, 
model.pos_embd, inp_pos), inpL); - } - cb(inpL, "inp_embd", -1); - - // embed layer norm - inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - cb(inpL, "inp_norm", -1); - - lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case); - - // iterate layers - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur = inpL; - - struct ggml_tensor * Qcur; - struct ggml_tensor * Kcur; - struct ggml_tensor * Vcur; - - // self-attention - if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); - cb(Qcur, "Qcur", il); - - if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, il); - } - - Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); - cb(Kcur, "Kcur", il); - - if (model.layers[il].attn_k_norm) { - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, il); - } - Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - } else { - // compute Q and K and RoPE them - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); - - //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); - cb(kq, "kq_soft_max_ext", il); - - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); - - ggml_build_forward_expand(gf, cur); - - cur = build_lora_mm(model.layers[il].wo, cur); - if (model.layers[il].bo) { - cb(cur, "kqv_wo", il); - } - - if (model.layers[il].bo) { - cur = ggml_add(ctx0, cur, model.layers[il].bo); - } - cb(cur, "kqv_out", il); - - if (il == 
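// Illustrative sketch (not part of this patch): the explicit attention math in the BERT
// path above computes scores = q·k scaled by 1/sqrt(n_embd_head), a softmax, and then a
// weighted sum of the values. attend_sketch is a hypothetical single-query, single-head,
// unmasked analog for intuition.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> attend_sketch(const std::vector<float> & q,
                                        const std::vector<std::vector<float>> & k,
                                        const std::vector<std::vector<float>> & v) {
    const float scale = 1.0f / std::sqrt((float) q.size());
    std::vector<float> score(k.size());
    float max_s = -1e30f, sum = 0.0f;
    for (size_t t = 0; t < k.size(); ++t) {
        float dot = 0.0f;
        for (size_t i = 0; i < q.size(); ++i) dot += q[i]*k[t][i];
        score[t] = dot*scale;
        max_s = std::max(max_s, score[t]);
    }
    for (float & s : score) { s = std::exp(s - max_s); sum += s; }   // softmax
    std::vector<float> out(v[0].size(), 0.0f);
    for (size_t t = 0; t < v.size(); ++t) {
        const float w = score[t]/sum;
        for (size_t i = 0; i < out.size(); ++i) out[i] += w*v[t][i];
    }
    return out;
}

int main() {
    const std::vector<float> q = {1.0f, 0.0f};
    const std::vector<std::vector<float>> k = {{1.0f, 0.0f}, {0.0f, 1.0f}};
    const std::vector<std::vector<float>> v = {{1.0f, 2.0f}, {3.0f, 4.0f}};
    for (float x : attend_sketch(q, k, v)) printf("%.4f ", x);
    printf("\n");
}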
n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // re-add the layer input - cur = ggml_add(ctx0, cur, inpL); - - // attention layer norm - cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); - - if (model.layers[il].attn_norm_2 != nullptr) { - cur = ggml_add(ctx0, cur, inpL); // re-add the layer input - cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); - } - - struct ggml_tensor * ffn_inp = cur; - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - if (model.arch == LLM_ARCH_BERT) { - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_PAR, cb, il); - } else { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - } - cb(cur, "ffn_out", il); - - // attentions bypass the intermediate layer - cur = ggml_add(ctx0, cur, ffn_inp); - - // output layer norm - cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cb(cur, "result_embd", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - inpL = build_norm(inpL, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, -1); - cb(inpL, "inp_norm", -1); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing 
output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * pos; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - if (model.pos_embd) { - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); - cb(pos, "pos_embd", -1); - - inpL = ggml_add(ctx0, inpL, pos); - cb(inpL, "inpL", -1); - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; - - attn_norm = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(attn_norm, "attn_norm", il); - - // self-attention - { - cur = attn_norm; - - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - if (model.layers[il].bqkv){ - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - } - - if (hparams.f_clamp_kqv > 0.0f) { - cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(cur, "wqkv_clamped", il); - } - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - // Q/K Layernorm - if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } else { - Qcur = 
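// Illustrative sketch (not part of this patch): the f_clamp_kqv guard above clamps the
// fused QKV activations element-wise into [-f_clamp_kqv, f_clamp_kqv] (ggml_clamp) before
// the heads are split out; a value of 0 disables it. clamp_kqv_sketch is hypothetical.
#include <algorithm>
#include <cstdio>
#include <vector>

static void clamp_kqv_sketch(std::vector<float> & qkv, float f_clamp_kqv) {
    if (f_clamp_kqv <= 0.0f) return;                  // disabled, as in the code above
    for (float & v : qkv) v = std::clamp(v, -f_clamp_kqv, f_clamp_kqv);
}

int main() {
    std::vector<float> qkv = {-12.0f, 0.5f, 9.0f};
    clamp_kqv_sketch(qkv, 8.0f);
    for (float v : qkv) printf("%.1f ", v);           // -8.0 0.5 8.0
    printf("\n");
}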
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // feed forward - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - model.layers[il].ffn_act, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_stablelm() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - struct ggml_tensor * inpSA = cur; - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - NULL, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - } - if (model.layers[il].attn_k_norm) { - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - NULL, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - } - - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - 
ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - if (model.layers[il].ffn_norm) { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - } else { - // parallel residual - cur = inpSA; - } - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_qwen() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - // using mode = 2 for neox mode - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused 
tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward forward - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, 
NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_qwen2vl() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - int sections[4]; - std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_multi( - ctx0, - ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_multi( - ctx0, - ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - 
LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_qwen2moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self_attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, false, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - // FFN shared expert - { - ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); - cb(cur_gate_inp, "ffn_shexp_gate_inp", il); - - // sigmoid - ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); - cb(cur_gate, "ffn_shexp_gate", il); - - ggml_tensor * cur_ffn = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur_ffn, "ffn_shexp", il); - - ggml_tensor * ffn_shexp_out = 
ggml_mul(ctx0, cur_ffn, cur_gate); - cb(ffn_shexp_out, "ffn_shexp_out", il); - - moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out); - cb(moe_out, "ffn_out", il); - - cur = moe_out; - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_phi2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * attn_norm_output; - struct ggml_tensor * ffn_output; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - attn_norm_output = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(attn_norm_output, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; - - if (model.layers[il].wqkv) { - cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - } else { - Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - // with phi2, we scale the Q to avoid precision issues - // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 - Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = 
ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids); - } - - // FF - { - ffn_output = build_ffn(attn_norm_output, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(ffn_output, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_output); - cur = ggml_add(ctx0, cur, inpL); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output_no_bias", -1); - - cur = ggml_add(ctx0, cur, model.output_b); - cb(cur, "result_output", -1); - ggml_build_forward_expand(gf, cur); - return gf; - } - - struct ggml_cgraph * build_phi3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); - - for (int il = 0; il < n_layer; ++il) { - auto residual = inpL; - - // self-attention - { - // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - struct ggml_tensor* attn_norm_output = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM_RMS, il); - cb(attn_norm_output, "attn_norm", il); - - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; - - if (model.layers[il].wqkv) { - cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); - cb(cur, "wqkv", il); - - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); - } else { - Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - 
cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor* inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - residual = ggml_get_rows(ctx0, residual, inp_out_ids); - } - - cur = ggml_add(ctx0, cur, residual); - residual = cur; - - cur = build_norm(cur, - model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - // feed-forward network - if (model.layers[il].ffn_gate_inp == nullptr) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } else { - // MoE branch - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - } - - cur = ggml_add(ctx0, residual, cur); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - - if (model.output_b != nullptr) { - cb(cur, "result_output_no_bias", -1); - cur = ggml_add(ctx0, cur, model.output_b); - } - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - - struct ggml_cgraph * build_plamo() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - struct ggml_tensor * attention_norm = cur; - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, - n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, - n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - struct ggml_tensor * sa_out = cur; - - cur = attention_norm; - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct 
ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // feed-forward network - { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, sa_out); - cur = ggml_add(ctx0, cur, inpL); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_gpt2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * pos; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); - cb(pos, "pos_embd", -1); - - inpL = ggml_add(ctx0, inpL, pos); - cb(inpL, "inpL", -1); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } 
- - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_codeshell() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(tmpq, "tmpq", il); - cb(tmpk, "tmpk", il); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_orion() { - struct ggml_cgraph 
* gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - // if (model.layers[il].bq) { - // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - // cb(Qcur, "Qcur", il); - // } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - // if (model.layers[il].bk) { - // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - // cb(Kcur, "Kcur", il); - // } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - // if (model.layers[il].bv) { - // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - // cb(Vcur, "Vcur", il); - // } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_internlm2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // 
inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_minicpm3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - //TODO: if the model varies, these parameters need to be read from the model - const int64_t n_embd_base = 256; - const float scale_embd = 12.0f; - const float scale_depth = 1.4f; - const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); - - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // scale the input embeddings - inpL = ggml_scale(ctx0, inpL, scale_embd); - cb(inpL, "inp_scaled", -1); - - // 
inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self_attention - { - struct ggml_tensor * q = NULL; - // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); - cb(q, "q", il); - - q = build_norm(q, - model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, il); - cb(q, "q", il); - - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); - cb(q, "q", il); - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - 0); - cb(q_nope, "q_nope", il); - - // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); - - // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); - - // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], - 0); - cb(kv_compressed, "kv_compressed", il); - - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); - - // TODO: the CUDA backend used to not support non-cont. 
(RMS) norm, investigate removing ggml_cont - kv_compressed = ggml_cont(ctx0, kv_compressed); - kv_compressed = build_norm(kv_compressed, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, il); - cb(kv_compressed, "kv_compressed", il); - - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); - cb(kv, "kv", il); - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); - - // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); - - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); - - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); - - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(q_pe, "q_pe", il); - - // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(k_pe, "k_pe", il); - - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); - - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); - cb(k_states, "k_states", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // scale_res - scale the hidden states for residual connection - const float scale_res = scale_depth/sqrtf(float(n_layer)); - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled", il); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // scale the hidden states for residual connection - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled_ffn", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head scaling - const float scale_lmhead = float(n_embd_base)/float(n_embd); - cur = ggml_scale(ctx0, cur, scale_lmhead); - cb(cur, "lmhead_scaling", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_gemma() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head_k = hparams.n_embd_head_k; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); - cb(inpL, "inp_scaled", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "Qcur", il); - - Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); - cb(Qcur, 
"Qcur_scaled", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); - cb(sa_out, "sa_out", il); - - cur = build_norm(sa_out, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - // feed-forward network - { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, sa_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_gemma2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head_k = hparams.n_embd_head_k; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); - cb(inpL, "inp_scaled", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); - - for (int il = 0; il < n_layer; ++il) { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "Qcur", il); - - // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e - switch (model.type) { - case LLM_TYPE_2B: - case LLM_TYPE_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; - case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; - default: GGML_ABORT("fatal error"); - }; - cb(Qcur, "Qcur_scaled", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - cur = build_norm(cur, - 
model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_post_norm", il); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); - cb(sa_out, "sa_out", il); - - cur = build_norm(sa_out, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - // feed-forward network - { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "ffn_post_norm", -1); - - cur = ggml_add(ctx0, cur, sa_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - // final logit soft-capping - cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); - cur = ggml_tanh(ctx0, cur); - cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - - struct ggml_cgraph * build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 
1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_mamba() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - // {n_embd, n_tokens} - inpL = build_inp_embd(model.tok_embd); - - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); - - for (int il = 0; il < n_layer; ++il) { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); - cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // residual - cur = ggml_add(ctx0, cur, inpL); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - // final rmsnorm - cur = build_norm(inpL, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_command_r() { - - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - const float f_logit_scale = hparams.f_logit_scale; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM, il); - cb(cur, "attn_norm", il); - struct ggml_tensor * ffn_inp = cur; - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, 
Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - if (model.layers[il].attn_q_norm) { - Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur) * n_embd_head, - ggml_element_size(Qcur) * n_embd_head * n_head, - 0); - cb(Qcur, "Qcur", il); - Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, - ggml_element_size(Kcur) * n_embd_head, - ggml_element_size(Kcur) * n_embd_head * n_head_kv, - 0); - cb(Kcur, "Kcur", il); - - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - NULL, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - NULL, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); - } - - struct ggml_tensor * attn_out = cur; - - // feed-forward network - { - cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // add together residual + FFN + self-attention - cur = ggml_add(ctx0, cur, inpL); - cur = ggml_add(ctx0, cur, attn_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - if (f_logit_scale) { - cur = ggml_scale(ctx0, cur, f_logit_scale); - } - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - - } - - struct ggml_cgraph * build_cohere2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - const float f_logit_scale = hparams.f_logit_scale; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); - - // sliding window switch pattern - const int32_t sliding_window_pattern = 4; - - for (int il = 0; il 
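Note: the Command-R block above uses a single shared norm and a parallel residual: the attention and FFN branches both read the same normed activations and their outputs are added back onto the layer input together. A toy 1-D sketch of that wiring (the names are placeholders, not graph code):

    #include <cstddef>
    #include <functional>
    #include <vector>

    using vec = std::vector<float>;
    using op  = std::function<vec(const vec &)>;

    // out = x + attn(norm(x)) + ffn(norm(x)), with a single shared "attn_norm"
    static vec command_r_block(const vec & x, const op & norm, const op & attn, const op & ffn) {
        const vec h = norm(x);
        const vec a = attn(h);
        const vec f = ffn(h);
        vec out(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            out[i] = x[i] + a[i] + f[i];
        }
        return out;
    }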
< n_layer; ++il) { - // three layers sliding window attention (window size 4096) and ROPE - // fourth layer uses global attention without positional embeddings - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - - // norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); - cb(cur, "attn_norm", il); - struct ggml_tensor * ffn_inp = cur; - - // self-attention - { - // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - if (is_sliding) { - Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, - beta_fast, beta_slow); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, - attn_factor, beta_fast, beta_slow); - cb(Kcur, "Kcur", il); - } else { - // For non-sliding layers, just reshape without applying RoPE - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - } - - cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - n_tokens, 1.0f / sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); - } - - struct ggml_tensor * attn_out = cur; - - // feed-forward network - { - cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, - NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, - cb, il); - cb(cur, "ffn_out", il); - } - - // add together residual + FFN + self-attention - cur = ggml_add(ctx0, cur, inpL); - cur = ggml_add(ctx0, cur, attn_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - if (f_logit_scale) { - cur = ggml_scale(ctx0, cur, f_logit_scale); - } - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // ref: https://allenai.org/olmo - // based on the original build_llama() function, changes: - // * non-parametric layer norm - // * clamp qkv - // * removed bias - // * removed MoE - struct ggml_cgraph * build_olmo() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 
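Note: the Cohere2 loop above alternates three sliding-window layers (with RoPE) and one global layer without positional embeddings, selected by il % sliding_window_pattern. A small sketch of the layer predicate plus a causal sliding-window visibility test; the window test is a standard formulation assumed here, not copied from this patch:

    #include <cstdint>

    // layer predicate from the loop above: 3 local layers, then 1 global one
    static bool is_sliding_layer(int il, int sliding_window_pattern = 4) {
        return il % sliding_window_pattern < (sliding_window_pattern - 1);
    }

    // causal sliding-window visibility test (assumed formulation)
    static bool can_attend(int64_t q_pos, int64_t k_pos, int64_t window) {
        return k_pos <= q_pos && q_pos - k_pos < window;
    }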
model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - NULL, NULL, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (hparams.f_clamp_kqv > 0.0f) { - Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (hparams.f_clamp_kqv > 0.0f) { - Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (hparams.f_clamp_kqv > 0.0f) { - Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - NULL, NULL, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - NULL, NULL, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_olmo2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = 
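Note: OLMo optionally clamps the Q/K/V activations into [-f_clamp_kqv, f_clamp_kqv] before RoPE and attention, as the ggml_clamp calls above show. A toy element-wise version of that guard:

    #include <algorithm>
    #include <vector>

    // clamp Q/K/V activations into [-c, c]; c <= 0 means clamping is disabled
    static void clamp_kqv(std::vector<float> & t, float c) {
        if (c <= 0.0f) {
            return;
        }
        for (float & x : t) {
            x = std::min(std::max(x, -c), c);
        }
    }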
build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = inpL; - - // self_attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, il); - cb(Qcur, "Qcur_normed", il); - - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, il); - cb(Kcur, "Kcur_normed", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur_rope", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - cur = build_norm(cur, - model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_post_norm", il); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "ffn_post_norm", -1); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // based on the build_qwen2moe() function, changes: - // * removed shared experts - // * removed bias - // * added q, k norm - struct ggml_cgraph * build_olmoe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // 
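Note: OLMo2 applies RMS norm to the Q and K projections (attn_q_norm / attn_k_norm) before reshaping and RoPE. A standalone per-vector RMSNorm sketch; the epsilon value is an assumption here:

    #include <cmath>

    // x: one head vector of length n; w: the norm weight; eps is an assumed default
    static void rms_norm(float * x, const float * w, int n, float eps = 1e-6f) {
        float ss = 0.0f;
        for (int i = 0; i < n; ++i) {
            ss += x[i] * x[i];
        }
        const float scale = 1.0f / std::sqrt(ss / n + eps);
        for (int i = 0; i < n; ++i) {
            x[i] = x[i] * scale * w[i];
        }
    }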
self_attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, il); - cb(Qcur, "Qcur_normed", il); - - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, il); - cb(Kcur, "Kcur_normed", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur_rope", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, false, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_openelm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head_qkv = 2*n_head_kv + n_head; - - cur = inpL; - struct ggml_tensor * residual = cur; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, 
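Note: the OLMoE MoE branch routes each token through build_moe_ffn with softmax gating and n_expert_used experts. A hedged sketch of top-k routing with weights renormalized over the selected experts; whether the softmax is taken before or after the top-k selection differs between variants and is an assumption here:

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <vector>

    // pick the k largest gate logits and renormalize their weights (k <= n_expert)
    static void route_top_k(const std::vector<float> & logits, int k,
                            std::vector<int> & idx, std::vector<float> & w) {
        idx.resize(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int a, int b) { return logits[a] > logits[b]; });
        idx.resize(k);

        w.resize(k);
        float sum = 0.0f;
        for (int i = 0; i < k; ++i) {
            w[i] = std::exp(logits[idx[i]] - logits[idx[0]]); // idx[0] holds the max logit
            sum += w[i];
        }
        for (float & wi : w) {
            wi /= sum;
        }
    }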
n_tokens, cur->nb[1], cur->nb[2], 0)); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); - cb(Vcur, "Vcur", il); - - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, il); - cb(Qcur, "Qcur", il); - - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, il); - cb(Kcur, "Kcur", il); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); - cb(Qcur, "Vcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - residual = ggml_get_rows(ctx0, residual, inp_out_ids); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - inpL = cur; - } - - cur = inpL; - - // norm - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_gptneox() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, 
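Note: OpenELM and GPT-NeoX both project a fused QKV tensor and then slice it into Q, K and V views, as the ggml_view_* calls above do via byte offsets. The same split on a plain per-token row, assuming the [Q | K | V] layout implied by those offsets:

    #include <cstdint>

    struct qkv_views {
        const float * q; // n_embd values
        const float * k; // n_embd_gqa values
        const float * v; // n_embd_gqa values
    };

    // row: one token's fused QKV output, laid out as [Q | K | V]
    static qkv_views split_qkv(const float * row, int64_t n_embd, int64_t n_embd_gqa) {
        return { row, row + n_embd, row + n_embd + n_embd_gqa };
    }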
ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // ffn - if (hparams.use_par_res) { - // attention and ffn are computed in parallel - // x = x + attn(ln1(x)) + ffn(ln2(x)) - - struct ggml_tensor * attn_out = cur; - - cur = build_norm(inpL, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, attn_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } else { - // attention and ffn are computed sequentially - // x = x + attn(ln1(x)) - // x = x + ffn(ln2(x)) - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_arctic() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute 
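Note: the GPT-NeoX builder above selects between the parallel and sequential residual layouts via hparams.use_par_res, matching the formulas in its comments. A toy 1-D restatement of the two layouts:

    #include <cstddef>
    #include <functional>
    #include <vector>

    using vec = std::vector<float>;
    using op  = std::function<vec(const vec &)>;

    static vec add(const vec & a, const vec & b) {
        vec r(a.size());
        for (size_t i = 0; i < a.size(); ++i) {
            r[i] = a[i] + b[i];
        }
        return r;
    }

    // use_par_res == true:  x = x + attn(ln1(x)) + ffn(ln2(x))
    static vec block_parallel(const vec & x, const op & ln1, const op & ln2,
                              const op & attn, const op & ffn) {
        return add(add(x, attn(ln1(x))), ffn(ln2(x)));
    }

    // use_par_res == false: x = x + attn(ln1(x)); x = x + ffn(ln2(x))
    static vec block_sequential(const vec & x, const op & ln1, const op & ln2,
                                const op & attn, const op & ffn) {
        const vec h = add(x, attn(ln1(x)));
        return add(h, ffn(ln2(h)));
    }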
Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); - cb(ffn_out, "ffn_out", il); - - // MoE - cur = build_norm(inpSA, - model.layers[il].ffn_norm_exps, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm_exps", il); - - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - cur = ggml_add(ctx0, cur, ffn_out); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_deepseek() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - const float kq_scale = hparams.f_attention_scale == 0.0f ? 
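Note: the DeepSeek builder above scales Q.K by hparams.f_attention_scale when it is set and otherwise falls back to the usual 1/sqrt(n_embd_head). The same selection as a standalone helper:

    #include <cmath>

    static float kq_scale(float f_attention_scale, int n_embd_head) {
        return f_attention_scale == 0.0f ? 1.0f / std::sqrt((float) n_embd_head)
                                         : f_attention_scale;
    }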
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } else { - // MoE branch - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, false, - false, hparams.expert_weights_scale, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(moe_out, "ffn_moe_out", il); - - // FFN shared expert - { - ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(ffn_shexp, "ffn_shexp", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - - 
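Note: in the DeepSeek MoE branch above, the routed expert output (ffn_moe_out) is added to an always-on shared expert (ffn_shexp). A toy combination of the two, with the routed indices and weights assumed to come from the gating step:

    #include <cstddef>
    #include <functional>
    #include <vector>

    using vec = std::vector<float>;

    // out = shared(x) + sum_i w_i * expert_{idx_i}(x)
    static vec moe_with_shared(const vec & x,
                               const std::vector<std::function<vec(const vec &)>> & experts,
                               const std::vector<int> & top_idx, const std::vector<float> & top_w,
                               const std::function<vec(const vec &)> & shared) {
        vec out = shared(x);
        for (size_t i = 0; i < top_idx.size(); ++i) {
            const vec e = experts[top_idx[i]](x);
            for (size_t j = 0; j < out.size(); ++j) {
                out[j] += top_w[i] * e[j];
            }
        }
        return out;
    }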
ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_deepseek2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - bool is_lite = (hparams.n_layer == 27); - - // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. - // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. - const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); - const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); - const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); - - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - // {n_embd, n_tokens} - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self_attention - { - struct ggml_tensor * q = NULL; - if (!is_lite) { - // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); - cb(q, "q", il); - - q = build_norm(q, - model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, il); - cb(q, "q", il); - - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); - cb(q, "q", il); - } else { - q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(q, "q", il); - } - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - 0); - cb(q_nope, "q_nope", il); - - // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); - - // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); - - // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], - 0); - cb(kv_compressed, "kv_compressed", il); - - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); - - // TODO: the CUDA backend used to not support non-cont. 
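Note: build_deepseek2 pre-scales kq_scale and attn_factor so YaRN RoPE behaves correctly, per the linked discussion. The same arithmetic restated as a standalone helper, using the exact expressions from the code above:

    #include <cmath>

    struct yarn_scales {
        float kq_scale;           // scale applied to Q.K
        float attn_factor_scaled; // attn_factor passed to ggml_rope_ext
    };

    static yarn_scales deepseek2_yarn_scales(float attn_factor, float rope_yarn_log_mul,
                                             float freq_scale, int n_embd_head_k) {
        const float mscale = attn_factor * (1.0f + rope_yarn_log_mul * std::log(1.0f / freq_scale));
        return {
            1.0f * mscale * mscale / std::sqrt((float) n_embd_head_k),
            1.0f / (1.0f + 0.1f * std::log(1.0f / freq_scale)),
        };
    }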
(RMS) norm, investigate removing ggml_cont - kv_compressed = ggml_cont(ctx0, kv_compressed); - kv_compressed = build_norm(kv_compressed, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, il); - cb(kv_compressed, "kv_compressed", il); - - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); - cb(kv, "kv", il); - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); - - // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); - - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); - - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); - - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_pe, "q_pe", il); - - // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
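Note: in the MLA attention above, each head's key is the concatenation of its own no-position ("nope") part with a single RoPE'd k_pe that is repeated across all heads. A plain-array sketch of that per-head assembly:

    #include <cstring>
    #include <vector>

    // k_nope_head: this head's n_embd_head_qk_nope values;
    // k_pe_shared: the RoPE'd n_embd_head_qk_rope values shared by all heads
    static std::vector<float> build_k_head(const float * k_nope_head, int n_nope,
                                           const float * k_pe_shared, int n_rope) {
        std::vector<float> k(n_nope + n_rope);
        std::memcpy(k.data(),          k_nope_head, n_nope * sizeof(float));
        std::memcpy(k.data() + n_nope, k_pe_shared, n_rope * sizeof(float));
        return k;
    }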
RoPE, investigate removing this - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_pe, "k_pe", il); - - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); - - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); - cb(k_states, "k_states", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } else { - // MoE branch - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - model.layers[il].ffn_exp_probs_b, - n_expert, n_expert_used, - LLM_FFN_SILU, hparams.expert_weights_norm, - true, hparams.expert_weights_scale, - (enum llama_expert_gating_func_type) hparams.expert_gating_func, - cb, il); - cb(moe_out, "ffn_moe_out", il); - - // FFN shared expert - { - ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(ffn_shexp, "ffn_shexp", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_bitnet() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - if (model.layers[il].wq_scale) { - Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); - } - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, 
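Note: the BitNet projections above optionally multiply the matmul result by a per-tensor scale (wq_scale, wk_scale, wv_scale, wo_scale, ffn_*_scale). A toy version of that guard; treating the scale as a single scalar is an assumption, real scale tensors may broadcast differently:

    #include <vector>

    // y: projection output for one token; scale: optional per-tensor scale (nullptr if absent)
    static void apply_optional_scale(std::vector<float> & y, const float * scale) {
        if (scale == nullptr) {
            return; // this projection has no scale tensor
        }
        for (float & v : y) {
            v *= *scale; // assumes a single scalar; real scale tensors may broadcast
        }
    }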
model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - // B1.K - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - if (model.layers[il].wk_scale) { - Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); - } - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - // B1.V - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - if (model.layers[il].wv_scale) { - Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); - } - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - NULL, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - - cur = build_norm(cur, - model.layers[il].attn_sub_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_sub_norm", il); - - cur = build_lora_mm(model.layers[il].wo, cur); - if (model.layers[il].wo_scale) { - cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); - } - if (model.layers[il].bo) { - cur = ggml_add(ctx0, cur, model.layers[il].bo); - } - cb(cur, "attn_o_out", il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward forward - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, - model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, - NULL, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_sub_out", il); - - cur = build_norm(cur, - model.layers[il].ffn_sub_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_sub_norm", il); - - cur = build_lora_mm(model.layers[il].ffn_down, cur); - if (model.layers[il].ffn_down_scale) { - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); - } - cb(cur, "ffn_down", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - // FIXME: do not use model.tok_embd directly, duplicate as model.output - cur = build_lora_mm(model.tok_embd, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - return gf; - } - - //struct ggml_cgraph * build_t5_enc() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - // const int64_t n_embd_head = hparams.n_embd_head_v; - // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - - // inpL = build_inp_embd(model.tok_embd); - - // 
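Note: as the FIXME above points out, the BitNet head reuses model.tok_embd directly as the output projection (tied embeddings). A naive standalone sketch of what that tied LM head computes for one token:

    #include <cstddef>
    #include <vector>

    // tok_embd: n_vocab x n_embd (row-major); h: hidden state of one token
    static std::vector<float> tied_lm_head(const std::vector<float> & tok_embd,
                                           const std::vector<float> & h,
                                           int n_vocab, int n_embd) {
        std::vector<float> logits(n_vocab, 0.0f);
        for (int v = 0; v < n_vocab; ++v) {
            const float * row = tok_embd.data() + (size_t) v * n_embd;
            for (int i = 0; i < n_embd; ++i) {
                logits[v] += row[i] * h[i];
            }
        }
        return logits;
    }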
GGML_ASSERT(lctx.is_encoding); - // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - - // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - - // for (int il = 0; il < n_layer; ++il) { - // struct ggml_tensor * inpSA = inpL; - - // // norm - // cur = build_norm(inpL, - // model.layers[il].attn_norm_enc, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm", il); - - // // self-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); - // cb(Qcur, "Qcur", il); - - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); - // cb(Kcur, "Kcur", il); - - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); - // cb(Vcur, "Vcur", il); - - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); - - // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - // cb(kq_b, "kq_b", il); - - // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); - - // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - // cb(v, "v", il); - - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - // cb(kqv, "kqv", il); - - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); - - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); - - // ggml_build_forward_expand(gf, cur); - - // cur = build_lora_mm(model.layers[il].wo_enc, cur); - // cb(cur, "kqv_out", il); - // } - - // if (il == n_layer - 1) { - // // skip computing output for unused tokens - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - // } - - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - // cb(ffn_inp, "ffn_inp", il); - - // // feed-forward network - // { - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm_enc, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); - - // // T5 uses relu, flan-T5 uses gelu-gated - // cur = build_ffn(cur, - // model.layers[il].ffn_up_enc, NULL, NULL, - // model.layers[il].ffn_gate_enc, NULL, NULL, - // model.layers[il].ffn_down_enc, NULL, NULL, - // NULL, - // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - // model.layers[il].ffn_gate_enc ? 
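Note: the commented-out T5 encoder path adds a learned relative-position bias to the raw attention scores (kq_b = kq + pos_bias) before the softmax. A conceptual sketch with the bucketing function left abstract, since its exact definition is not part of this hunk:

    #include <cstddef>
    #include <functional>
    #include <vector>

    // kq: n_q x n_k raw attention scores, row-major; attn_rel_b: one bias per bucket
    static void add_pos_bias(std::vector<float> & kq, int n_q, int n_k,
                             const std::function<int(int /*k_pos - q_pos*/)> & bucket,
                             const std::vector<float> & attn_rel_b) {
        for (int i = 0; i < n_q; ++i) {
            for (int j = 0; j < n_k; ++j) {
                kq[(size_t) i * n_k + j] += attn_rel_b[bucket(j - i)];
            }
        }
    }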
LLM_FFN_PAR : LLM_FFN_SEQ, - // cb, il); - // cb(cur, "ffn_out", il); - // } - - // cur = ggml_add(ctx0, cur, ffn_inp); - // cb(cur, "ffn_out", il); - - // ggml_tensor * layer_dir = cvec.tensor_for(il); - // if (layer_dir != nullptr) { - // cur = ggml_add(ctx0, cur, layer_dir); - // } - // cb(cur, "l_out", il); - - // // input for next layer - // inpL = cur; - // } - - // cur = inpL; - // cb(cur, "result_embd", -1); - - // cur = build_norm(cur, - // model.output_norm_enc, NULL, - // LLM_NORM_RMS, -1); - // cb(cur, "result_norm", -1); - - // ggml_build_forward_expand(gf, cur); - - // return gf; - //} - - //struct ggml_cgraph * build_t5_dec() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - // const int64_t n_embd_head = hparams.n_embd_head_v; - // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - - // inpL = build_inp_embd(model.tok_embd); - - // GGML_ASSERT(!lctx.is_encoding); - // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - - // struct ggml_tensor * embd_enc = build_inp_embd_enc(); - // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - - // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); - - // for (int il = 0; il < n_layer; ++il) { - // struct ggml_tensor * inpSA = inpL; - - // // norm - // cur = build_norm(inpL, - // model.layers[il].attn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm", il); - - // // self-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - // cb(Qcur, "Qcur", il); - - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - // cb(Kcur, "Kcur", il); - - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - // cb(Vcur, "Vcur", il); - - // build_kv_store(gf, Kcur, Vcur, il); - - // struct ggml_tensor * k = - // ggml_view_3d(ctx0, kv_self.k_l[il], - // n_embd_head_k, n_kv, n_head_kv, - // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - // 0); - // cb(k, "k", il); - - // struct ggml_tensor * v = - // ggml_view_3d(ctx0, kv_self.v_l[il], - // n_kv, n_embd_head_v, n_head_kv, - // ggml_element_size(kv_self.v_l[il])*n_ctx, - // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - // 0); - // cb(v, "v", il); - - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); - - // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
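Note: as the comment in the T5 code says, T5 uses a plain ReLU FFN while flan-T5 uses a gated GELU, selected by whether ffn_gate_enc exists. A per-element sketch of that selection; the tanh approximation of GELU is an assumption, not necessarily the exact variant ggml uses:

    #include <cmath>

    // tanh approximation of GELU (assumed; the exact ggml variant may differ)
    static float gelu(float x) {
        return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
    }

    static float relu(float x) { return x > 0.0f ? x : 0.0f; }

    // per-element FFN activation: gated GELU when a gate value exists, plain ReLU otherwise
    static float ffn_act(float up, const float * gate /* nullable */) {
        return gate ? gelu(*gate) * up : relu(up);
    }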
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - // cb(kq_b, "kq_b", il); - - // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); - - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - // cb(kqv, "kqv", il); - - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); - - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); - - // ggml_build_forward_expand(gf, cur); - - // cur = build_lora_mm(model.layers[il].wo, cur); - // cb(cur, "kqv_out", il); - // } - - // cur = ggml_add(ctx0, cur, inpSA); - // cb(cur, "cross_inp", il); - - // struct ggml_tensor * inpCA = cur; - - // // norm - // cur = build_norm(cur, - // model.layers[il].attn_norm_cross, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm_cross", il); - - // // cross-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); - // cb(Qcur, "Qcur", il); - - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); - // cb(Kcur, "Kcur", il); - - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); - // cb(Vcur, "Vcur", il); - - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); - - // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); - - // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - // cb(v, "v", il); - - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - // cb(kqv, "kqv", il); - - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); - - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); - - // ggml_build_forward_expand(gf, cur); - - // cur = build_lora_mm(model.layers[il].wo_cross, cur); - // cb(cur, "kqv_out", il); - // } - - // if (il == n_layer - 1) { - // // skip computing output for unused tokens - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - // } - - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - // cb(ffn_inp, "ffn_inp", il); - - // // feed-forward network - // { - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); - - // // T5 uses relu, flan-T5 uses gelu-gated - // cur = build_ffn(cur, - // model.layers[il].ffn_up, NULL, NULL, - // model.layers[il].ffn_gate, NULL, NULL, - // model.layers[il].ffn_down, NULL, NULL, - // NULL, - // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - // model.layers[il].ffn_gate_enc ? 
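Note: the commented decoder block above wires cross-attention: queries come from the decoder stream, while keys and values are projected from the fixed encoder output, so scores span n_outputs_enc positions. A plain-loop, single-head sketch of that computation (unscaled, as in the surrounding code):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // q: n_embd_head; k, v: n_outputs_enc x n_embd_head, row-major
    static std::vector<float> cross_attend(const std::vector<float> & q,
                                           const std::vector<float> & k,
                                           const std::vector<float> & v,
                                           int n_outputs_enc, int n_embd_head) {
        std::vector<float> scores(n_outputs_enc);
        float max_s = -1e30f;
        for (int j = 0; j < n_outputs_enc; ++j) {
            float s = 0.0f;
            for (int d = 0; d < n_embd_head; ++d) {
                s += q[d] * k[(size_t) j * n_embd_head + d];
            }
            scores[j] = s;
            max_s = std::max(max_s, s);
        }
        float sum = 0.0f;
        for (float & s : scores) {
            s = std::exp(s - max_s);
            sum += s;
        }
        std::vector<float> out(n_embd_head, 0.0f);
        for (int j = 0; j < n_outputs_enc; ++j) {
            const float w = scores[j] / sum;
            for (int d = 0; d < n_embd_head; ++d) {
                out[d] += w * v[(size_t) j * n_embd_head + d];
            }
        }
        return out;
    }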
LLM_FFN_PAR : LLM_FFN_SEQ, - // cb, il); - // cb(cur, "ffn_out", il); - // } - - // cur = ggml_add(ctx0, cur, ffn_inp); - // cb(cur, "ffn_out", il); - - // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - // if (layer_dir != nullptr) { - // cur = ggml_add(ctx0, cur, layer_dir); - // } - // cb(cur, "l_out", il); - - // // input for next layer - // inpL = cur; - // } - - // cur = inpL; - // cb(cur, "result_embd", -1); - - // cur = build_norm(cur, - // model.output_norm, NULL, - // LLM_NORM_RMS, -1); - // cb(cur, "result_norm", -1); - - // // lm_head - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); - - // ggml_build_forward_expand(gf, cur); - - // return gf; - //} - - struct ggml_cgraph * build_jais() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_chatglm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == 
hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, - NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; - - if (model.layers[il].wqkv == nullptr) { - Qcur = build_lora_mm(model.layers[il].wq, cur); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - } - Kcur = build_lora_mm(model.layers[il].wk, cur); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - } - Vcur = build_lora_mm(model.layers[il].wv, cur); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - } - } else { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - if (model.layers[il].bqkv) { - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - } - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor); - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur_rope", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - - } - - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); - } - - cur = build_norm(inpL, - model.output_norm, - NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_nemotron() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), 
false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - //GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_exaone() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct 
ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - GGML_ASSERT(hparams.token_shift_count == 2); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); - - const auto n_embd = hparams.n_embd; - const auto n_seq_tokens = ubatch.n_seq_tokens; - const auto n_seqs = ubatch.n_seqs; - - for (int il = 0; 
il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; - - struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( - ctx0, gf, state_copy, state_mask, ubatch, il, worst_case - ); - - struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - - struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); - cb(att_norm, "attn_norm", il); - - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - att_shift, - ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), - 1 - ); - - cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); - cb(ffn_norm, "ffn_norm", il); - - x_prev = ggml_concat( - ctx0, - ffn_shift, - ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), - 1 - ); - - cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); - cur = ggml_add(ctx0, cur, ffn_inp); - - token_shift = ggml_concat(ctx0, - ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), - ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), - 1 - ); - ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - - if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - cur = ggml_scale(ctx0, cur, 0.5F); - } - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - - cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); - - const auto n_embd = hparams.n_embd; - const auto n_seq_tokens = ubatch.n_seq_tokens; - const auto n_seqs = ubatch.n_seqs; - - inpL = build_inp_embd(model.tok_embd); - - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; - - struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( - ctx0, gf, state_copy, state_mask, ubatch, il, worst_case - ); - - struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, 
layer->attn_norm_b, LLM_NORM_RMS, il); - cb(att_norm, "attn_norm", il); - - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - token_shift, - ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), - 1 - ); - - cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - - token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); - ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - - cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // ref: https://github.com/facebookresearch/chameleon - // based on the original build_llama() function, changes: - // * qk-norm - // * swin-norm - // * removed bias - // * removed MoE - struct ggml_cgraph * build_chameleon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - if (hparams.swin_norm) { - cur = inpL; - } else { - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - } - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - if (model.layers[il].attn_q_norm) { - Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur) * n_embd_head, - ggml_element_size(Qcur) * n_embd_head * n_head, - 0); - cb(Qcur, "Qcur", il); - - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - } - - if (model.layers[il].attn_k_norm) { - Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, - ggml_element_size(Kcur) * n_embd_head, - ggml_element_size(Kcur) * n_embd_head * n_head_kv, 
- 0); - cb(Kcur, "Kcur", il); - - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - - if (hparams.swin_norm) { - cur = build_norm(cur, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - } - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - if (!hparams.swin_norm) { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - } - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - if (hparams.swin_norm) { - cur = build_norm(cur, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output_with_img_logits", -1); - - // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. - // Needs to be removed once image outputs are supported. 
- int img_token_end_idx = 8196; - int img_token_start_idx = 4; - int num_img_tokens = img_token_end_idx - img_token_start_idx; - // creates 1d tensor of size num_img_tokens and values -FLT_MAX, - // which ensures that text token values are always at least larger than image token values - struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); - img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); - cb(img_logits, "img_logits", -1); - cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_wavtokenizer_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); - - cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.conv1d_b); - - // posnet - for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { - const auto & layer = model.layers[il].posnet; - - inpL = cur; - - switch (il) { - case 0: - case 1: - case 3: - case 4: - { - cur = build_norm(cur, - layer.norm1, - layer.norm1_b, - LLM_NORM_GROUP, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.conv1_b); - - cur = build_norm(cur, - layer.norm2, - layer.norm2_b, - LLM_NORM_GROUP, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.conv2_b); - - cur = ggml_add(ctx0, cur, inpL); - } break; - case 2: - { - cur = build_norm(cur, - layer.attn_norm, - layer.attn_norm_b, - LLM_NORM_GROUP, 0); - - struct ggml_tensor * q; - struct ggml_tensor * k; - struct ggml_tensor * v; - - q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); - k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); - v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); - - q = ggml_add(ctx0, q, layer.attn_q_b); - k = ggml_add(ctx0, k, layer.attn_k_b); - v = ggml_add(ctx0, v, layer.attn_v_b); - - q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); - k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - - kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); - - cur = ggml_mul_mat(ctx0, kq, v); - - cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.attn_o_b); - - cur = ggml_add(ctx0, cur, inpL); - } break; - case 5: - { - cur = build_norm(cur, - layer.norm, - layer.norm_b, - LLM_NORM_GROUP, 0); - } break; - default: GGML_ABORT("unknown posnet layer"); - }; - } - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - cur = build_norm(cur, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, -1); - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - inpL = cur; - - // convnext - for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { - const auto & layer = model.layers[il].convnext; - - cur = inpL; - - cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.dw_b); - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - cur = build_norm(cur, - layer.norm, - layer.norm_b, - LLM_NORM, -1); - - cur = build_ffn(cur, - layer.pw1, layer.pw1_b, NULL, - NULL, NULL, NULL, - layer.pw2, layer.pw2_b, NULL, - NULL, - LLM_FFN_GELU, 
LLM_FFN_SEQ, cb, il); - - cur = ggml_mul(ctx0, cur, layer.gamma); - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - inpL = ggml_add(ctx0, cur, inpL); - } - - cur = inpL; - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - cur = ggml_add(ctx0, cur, model.output_b); - cb(cur, "result_embd", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } -}; - -static struct ggml_cgraph * llama_build_graph( - llama_context & lctx, - const llama_ubatch & ubatch, - bool worst_case) { - const auto & model = lctx.get_model(); - const auto & cparams = lctx.get_cparams(); - - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) - llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { - if (il >= 0) { - ggml_format_name(cur, "%s-%d", name, il); - } else { - ggml_set_name(cur, name); - } - - if (!cparams.offload_kqv) { - if (strcmp(name, "kqv_merged_cont") == 0) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu); - } - } - - // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends - // FIXME: fix in ggml_backend_sched - const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; - if (ubatch.n_tokens < 32 || full_offload) { - if (il != -1 && strcmp(name, "norm") == 0) { - const auto & dev_layer = model.dev_layer(il); - for (auto & backend : lctx.backends) { - if (ggml_backend_get_device(backend.get()) == dev_layer) { - if (ggml_backend_supports_op(backend.get(), cur)) { - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get()); - } - } - } - } - } - }; - - struct ggml_cgraph * result = NULL; - - struct llm_build_context llm(lctx, lctx.get_model(), lctx.get_cparams(), ubatch, std::move(cb), lctx.init(), worst_case); - - switch (model.arch) { - case LLM_ARCH_LLAMA: - case LLM_ARCH_MINICPM: - case LLM_ARCH_GRANITE: - case LLM_ARCH_GRANITE_MOE: - { - result = llm.build_llama(); - } break; - case LLM_ARCH_DECI: - { - result = llm.build_deci(); - } break; - case LLM_ARCH_BAICHUAN: - { - result = llm.build_baichuan(); - } break; - case LLM_ARCH_FALCON: - { - result = llm.build_falcon(); - } break; - case LLM_ARCH_GROK: - { - result = llm.build_grok(); - } break; - case LLM_ARCH_STARCODER: - { - result = llm.build_starcoder(); - } break; - case LLM_ARCH_REFACT: - { - result = llm.build_refact(); - } break; - case LLM_ARCH_BERT: - case LLM_ARCH_JINA_BERT_V2: - case LLM_ARCH_NOMIC_BERT: - { - result = llm.build_bert(); - } break; - case LLM_ARCH_BLOOM: - { - result = llm.build_bloom(); - } break; - case LLM_ARCH_MPT: - { - result = llm.build_mpt(); - } break; - case LLM_ARCH_STABLELM: - { - result = llm.build_stablelm(); - } break; - case LLM_ARCH_QWEN: - { - result = llm.build_qwen(); - } break; - case LLM_ARCH_QWEN2: - { - result = llm.build_qwen2(); - } break; - case LLM_ARCH_QWEN2VL: - { - result = llm.build_qwen2vl(); - } break; - case LLM_ARCH_QWEN2MOE: - { - result = llm.build_qwen2moe(); - } break; - case LLM_ARCH_PHI2: - { - result = llm.build_phi2(); - } break; - case LLM_ARCH_PHI3: - case LLM_ARCH_PHIMOE: - { - result = llm.build_phi3(); - } break; - case LLM_ARCH_PLAMO: - { - result = llm.build_plamo(); - } break; - case LLM_ARCH_GPT2: - { - result = 
llm.build_gpt2(); - } break; - case LLM_ARCH_CODESHELL: - { - result = llm.build_codeshell(); - } break; - case LLM_ARCH_ORION: - { - result = llm.build_orion(); - } break; - case LLM_ARCH_INTERNLM2: - { - result = llm.build_internlm2(); - } break; - case LLM_ARCH_MINICPM3: - { - result = llm.build_minicpm3(); - } break; - case LLM_ARCH_GEMMA: - { - result = llm.build_gemma(); - } break; - case LLM_ARCH_GEMMA2: - { - result = llm.build_gemma2(); - } break; - case LLM_ARCH_STARCODER2: - { - result = llm.build_starcoder2(); - } break; - case LLM_ARCH_MAMBA: - { - result = llm.build_mamba(); - } break; - case LLM_ARCH_XVERSE: - { - result = llm.build_xverse(); - } break; - case LLM_ARCH_COMMAND_R: - { - result = llm.build_command_r(); - } break; - case LLM_ARCH_COHERE2: - { - result = llm.build_cohere2(); - } break; - case LLM_ARCH_DBRX: - { - result = llm.build_dbrx(); - } break; - case LLM_ARCH_OLMO: - { - result = llm.build_olmo(); - } break; - case LLM_ARCH_OLMO2: - { - result = llm.build_olmo2(); - } break; - case LLM_ARCH_OLMOE: - { - result = llm.build_olmoe(); - } break; - case LLM_ARCH_OPENELM: - { - result = llm.build_openelm(); - } break; - case LLM_ARCH_GPTNEOX: - { - result = llm.build_gptneox(); - } break; - case LLM_ARCH_ARCTIC: - { - result = llm.build_arctic(); - } break; - case LLM_ARCH_DEEPSEEK: - { - result = llm.build_deepseek(); - } break; - case LLM_ARCH_DEEPSEEK2: - { - result = llm.build_deepseek2(); - } break; - case LLM_ARCH_CHATGLM: - { - result = llm.build_chatglm(); - } break; - case LLM_ARCH_BITNET: - { - result = llm.build_bitnet(); - } break; - //case LLM_ARCH_T5: - // { - // if (lctx.is_encoding) { - // result = llm.build_t5_enc(); - // } else { - // result = llm.build_t5_dec(); - // } - // } break; - //case LLM_ARCH_T5ENCODER: - // { - // result = llm.build_t5_enc(); - // } break; - case LLM_ARCH_JAIS: - { - result = llm.build_jais(); - } break; - case LLM_ARCH_NEMOTRON: - { - result = llm.build_nemotron(); - } break; - case LLM_ARCH_EXAONE: - { - result = llm.build_exaone(); - } break; - case LLM_ARCH_RWKV6: - { - result = llm.build_rwkv6(); - } break; - case LLM_ARCH_RWKV6QWEN2: - { - result = llm.build_rwkv6qwen2(); - } break; - case LLM_ARCH_CHAMELEON: - { - result = llm.build_chameleon(); - } break; - case LLM_ARCH_WAVTOKENIZER_DEC: - { - result = llm.build_wavtokenizer_dec(); - } break; - default: - GGML_ABORT("fatal error"); - } - - // add on pooling layer - if (cparams.embeddings) { - result = llm.append_pooling(result); - } - - return result; -} - // // interface implementation // @@ -7740,10 +327,7 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct - ctx = new llama_context_unified(*model, params, - [](llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { - return llama_build_graph(lctx, ubatch, worst_case); - }); + ctx = new llama_context_unified(*model, params); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); return nullptr; From 6ee86e5e0f45e99fe2f0c3b322fe3ab82e632f9b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 16:29:15 +0200 Subject: [PATCH 37/84] graph : restore ubatch in build_cb ggml-ci --- src/llama-context.cpp | 6 ++---- src/llama-context.h | 1 + src/llama-graph.h | 1 + src/llama-model.cpp | 3 ++- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 74d6a67bbe9d2..62f76f48b9d08 100644 --- 
a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -196,6 +196,7 @@ bool llama_context::apply_adapter_cvec( void llama_context::build_cb( ggml_tensor * cur, const char * name, + const llama_ubatch & ubatch, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -213,10 +214,7 @@ void llama_context::build_cb( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; - // TODO: during #11213, the requirement for ubatch.n_tokens < 32 was removed to simplify - // not sure if this is still needed, but it can be brought back if needed - //if (ubatch.n_tokens < 32 || full_offload) { - if (full_offload) { + if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); for (auto & backend : backends) { diff --git a/src/llama-context.h b/src/llama-context.h index 8d7a6ad58dec4..dc85c797100a4 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -85,6 +85,7 @@ struct llama_context : public llama_graph_i { virtual void build_cb( ggml_tensor * cur, const char * name, + const llama_ubatch & ubatch, int il); // TODO: add encode/decode graphs diff --git a/src/llama-graph.h b/src/llama-graph.h index 0084d99ccade6..d111d76e92b93 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -14,6 +14,7 @@ class llama_graph_i { virtual void build_cb( ggml_tensor * cur, const char * name, + const llama_ubatch & ubatch, int il) = 0; // apply control vector for layer il diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bded48be6c25b..ba11f1e1514cc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -248,6 +248,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara return cur_buft; } } + return nullptr; } @@ -3888,7 +3889,7 @@ struct llm_build_context { // TODO: tmp void cb(struct ggml_tensor * cur, const char * name, int il) { - lgf.build_cb(cur, name, il); + lgf.build_cb(cur, name, ubatch, il); } // TODO: tmp From fbe6a07256c36264bfbb0749d2285f397edf38bb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 17:16:44 +0200 Subject: [PATCH 38/84] context : rename to llama_context_kv_self --- src/llama-context.cpp | 140 +++++++++++++++++++++--------------------- src/llama-context.h | 54 ++++++++-------- src/llama-graph.h | 3 + src/llama-model.h | 1 + src/llama.cpp | 2 +- 5 files changed, 102 insertions(+), 98 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 62f76f48b9d08..665a144d70252 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -332,10 +332,10 @@ void llama_context::perf_reset() { } // -// llama_context_unified +// llama_context_kv_self // -llama_context_unified::llama_context_unified( +llama_context_kv_self::llama_context_kv_self( const llama_model & model, const llama_context_params & params) : llama_context(model) { const auto & hparams = model.hparams; @@ -636,29 +636,29 @@ llama_context_unified::llama_context_unified( } } -llama_context_unified::~llama_context_unified() = default; +llama_context_kv_self::~llama_context_kv_self() = default; -uint32_t llama_context_unified::n_seq_max() const { +uint32_t llama_context_kv_self::n_seq_max() const { // TODO: add notion of n_seq_max to llama_kv_cache and use it here return kv_self.size; } -llama_kv_cache * llama_context_unified::get_kv_self() { +llama_kv_cache * 
llama_context_kv_self::get_kv_self() { return &kv_self; } -const llama_kv_cache * llama_context_unified::get_kv_self() const { +const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -float * llama_context_unified::get_logits() { +float * llama_context_kv_self::get_logits() { // reorder logits for backward compatibility reorder_outputs(); return logits; } -float * llama_context_unified::get_logits_ith(int32_t i) { +float * llama_context_kv_self::get_logits_ith(int32_t i) { int32_t j = -1; try { @@ -696,14 +696,14 @@ float * llama_context_unified::get_logits_ith(int32_t i) { } } -float * llama_context_unified::get_embeddings() { +float * llama_context_kv_self::get_embeddings() { // reorder embeddings for backward compatibility reorder_outputs(); return embd; } -float * llama_context_unified::get_embeddings_ith(int32_t i) { +float * llama_context_kv_self::get_embeddings_ith(int32_t i) { int32_t j = -1; try { @@ -741,7 +741,7 @@ float * llama_context_unified::get_embeddings_ith(int32_t i) { } } -float * llama_context_unified::get_embeddings_seq(llama_seq_id seq_id) { +float * llama_context_kv_self::get_embeddings_seq(llama_seq_id seq_id) { auto it = embd_seq.find(seq_id); if (it == embd_seq.end()) { return nullptr; @@ -750,7 +750,7 @@ float * llama_context_unified::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } -ggml_context_ptr llama_context_unified::init() { +ggml_context_ptr llama_context_kv_self::init() { inp_tokens = nullptr; inp_embd = nullptr; inp_pos = nullptr; @@ -771,8 +771,8 @@ ggml_context_ptr llama_context_unified::init() { return llama_context::init(); } -struct llama_context_unified::batch_manager { - batch_manager(llama_context_unified & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { +struct llama_context_kv_self::batch_manager { + batch_manager(llama_context_kv_self & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & model = lctx.model; const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; @@ -982,18 +982,18 @@ struct llama_context_unified::batch_manager { int64_t n_outputs_all = 0; - llama_context_unified & lctx; + llama_context_kv_self & lctx; const llama_batch & batch; llama_kv_slot_restorer kv_slot_restorer; }; -std::unique_ptr llama_context_unified::prepare_batch(const llama_batch & batch) { +std::unique_ptr llama_context_kv_self::prepare_batch(const llama_batch & batch) { return std::make_unique(*this, batch); } -int llama_context_unified::decode(llama_batch & inp_batch) { +int llama_context_kv_self::decode(llama_batch & inp_batch) { is_encoding = false; if (inp_batch.n_tokens == 0) { @@ -1198,7 +1198,7 @@ int llama_context_unified::decode(llama_batch & inp_batch) { return 0; } -int llama_context_unified::encode(llama_batch & inp_batch) { +int llama_context_kv_self::encode(llama_batch & inp_batch) { is_encoding = true; if (inp_batch.n_tokens == 0) { @@ -1375,7 +1375,7 @@ int llama_context_unified::encode(llama_batch & inp_batch) { return 0; } -enum ggml_status llama_context_unified::compute_graph( +enum ggml_status llama_context_kv_self::compute_graph( ggml_cgraph * graph, bool batched) { int n_threads = batched ? 
cparams.n_threads_batch : cparams.n_threads; @@ -1402,23 +1402,23 @@ enum ggml_status llama_context_unified::compute_graph( return status; } -llama_pos llama_context_unified::pos_max() const { +llama_pos llama_context_kv_self::pos_max() const { return kv_self.pos_max(); } -uint32_t llama_context_unified::get_ctx_padding(const llama_cparams & cparams) const { +uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { return kv_self.get_padding(cparams); } -void llama_context_unified::prepare_k_shift() { +void llama_context_kv_self::prepare_k_shift() { } -void llama_context_unified::prepare_defrag() { +void llama_context_kv_self::prepare_defrag() { } // llama input -void llama_context_unified::set_inputs(const llama_ubatch & ubatch) { +void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; // @@ -1837,7 +1837,7 @@ void llama_context_unified::set_inputs(const llama_ubatch & ubatch) { } } -void llama_context_unified::reorder_outputs() { +void llama_context_kv_self::reorder_outputs() { std::vector & out_ids = sbatch.out_ids; if (!out_ids.empty()) { const uint32_t n_vocab = model.vocab.n_tokens(); @@ -1875,7 +1875,7 @@ void llama_context_unified::reorder_outputs() { } } -size_t llama_context_unified::reserve_outputs(size_t n_outputs) { +size_t llama_context_kv_self::reserve_outputs(size_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; @@ -1944,7 +1944,7 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) { return n_outputs_max; } -void llama_context_unified::kv_self_update() { +void llama_context_kv_self::kv_self_update() { auto & kv = kv_self; if (kv.has_shift) { @@ -2009,7 +2009,7 @@ void llama_context_unified::kv_self_update() { } } -void llama_context_unified::build_attn_inp( +void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -2040,7 +2040,7 @@ void llama_context_unified::build_attn_inp( } } -void llama_context_unified::build_attn_kv_store( +void llama_context_kv_self::build_attn_kv_store( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * k_cur, @@ -2084,7 +2084,7 @@ void llama_context_unified::build_attn_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); } -ggml_tensor * llama_context_unified::build_attn_qkv( +ggml_tensor * llama_context_kv_self::build_attn_qkv( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * wo, @@ -2236,7 +2236,7 @@ ggml_tensor * llama_context_unified::build_attn_qkv( return cur; } -ggml_tensor * llama_context_unified::build_soft_max_ext( +ggml_tensor * llama_context_kv_self::build_soft_max_ext( ggml_context * ctx0, ggml_tensor * kq, float kq_scale) { @@ -2245,7 +2245,7 @@ ggml_tensor * llama_context_unified::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -ggml_tensor * llama_context_unified::build_inp_embd( +ggml_tensor * llama_context_kv_self::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) { @@ -2295,7 +2295,7 @@ ggml_tensor * llama_context_unified::build_inp_embd( return inpL; } -ggml_tensor * llama_context_unified::build_inp_pos( +ggml_tensor * llama_context_kv_self::build_inp_pos( ggml_context * ctx0, int32_t n_tokens) { inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); @@ -2304,7 +2304,7 @@ ggml_tensor * llama_context_unified::build_inp_pos( return inp_pos; } -ggml_tensor * 
llama_context_unified::build_inp_out_ids( +ggml_tensor * llama_context_kv_self::build_inp_out_ids( ggml_context * ctx0, int32_t n_tokens, bool worst_case) { @@ -2316,7 +2316,7 @@ ggml_tensor * llama_context_unified::build_inp_out_ids( return inp_out_ids; } -ggml_tensor * llama_context_unified::build_inp_mean( +ggml_tensor * llama_context_kv_self::build_inp_mean( ggml_context * ctx0, int32_t n_tokens) { inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); @@ -2325,7 +2325,7 @@ ggml_tensor * llama_context_unified::build_inp_mean( return inp_mean; } -ggml_tensor * llama_context_unified::build_inp_cls( +ggml_tensor * llama_context_kv_self::build_inp_cls( ggml_context * ctx0, int32_t n_tokens) { inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -2334,7 +2334,7 @@ ggml_tensor * llama_context_unified::build_inp_cls( return inp_cls; } -void llama_context_unified::build_k_shift( +void llama_context_kv_self::build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) { const auto & n_ctx = cparams.n_ctx; @@ -2406,7 +2406,7 @@ void llama_context_unified::build_k_shift( } } -void llama_context_unified::build_defrag( +void llama_context_kv_self::build_defrag( ggml_context * ctx0, ggml_cgraph * graph) { const auto & hparams = model.hparams; @@ -2676,7 +2676,7 @@ void llama_context_unified::build_defrag( #endif } -ggml_tensor * llama_context_unified::build_inp_embd_enc( +ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, bool worst_case) { @@ -2692,7 +2692,7 @@ ggml_tensor * llama_context_unified::build_inp_embd_enc( return inp_embd_enc; } -ggml_tensor * llama_context_unified::build_inp_KQ_mask_cross( +ggml_tensor * llama_context_kv_self::build_inp_KQ_mask_cross( ggml_context * ctx0, int32_t n_tokens, bool worst_case) { @@ -2708,7 +2708,7 @@ ggml_tensor * llama_context_unified::build_inp_KQ_mask_cross( return inp_KQ_mask_cross; } -ggml_tensor * llama_context_unified::build_inp_s_copy( +ggml_tensor * llama_context_kv_self::build_inp_s_copy( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? kv_self.size : kv_self.n; @@ -2719,7 +2719,7 @@ ggml_tensor * llama_context_unified::build_inp_s_copy( return inp_s_copy; } -ggml_tensor * llama_context_unified::build_inp_s_mask( +ggml_tensor * llama_context_kv_self::build_inp_s_mask( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; @@ -2729,7 +2729,7 @@ ggml_tensor * llama_context_unified::build_inp_s_mask( return inp_s_mask; } -ggml_tensor * llama_context_unified::build_copy_mask_state( +ggml_tensor * llama_context_kv_self::build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * s, @@ -2764,7 +2764,7 @@ ggml_tensor * llama_context_unified::build_copy_mask_state( } // TODO: split -ggml_tensor * llama_context_unified::build_mamba_layer( +ggml_tensor * llama_context_kv_self::build_mamba_layer( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -2900,7 +2900,7 @@ ggml_tensor * llama_context_unified::build_mamba_layer( } -ggml_tensor * llama_context_unified::build_rwkv_token_shift_load( +ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * state_copy, @@ -2927,7 +2927,7 @@ ggml_tensor * llama_context_unified::build_rwkv_token_shift_load( } -ggml_tensor * llama_context_unified::build_rwkv_token_shift_store( +ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, @@ -2951,7 +2951,7 @@ ggml_tensor * llama_context_unified::build_rwkv_token_shift_store( } -ggml_tensor * llama_context_unified::build_rwkv6_time_mix( +ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -3130,7 +3130,7 @@ ggml_tensor * llama_context_unified::build_rwkv6_time_mix( // TODO: replace all non-fatal assertions with returned errors or exceptions struct llama_data_write { - llama_data_write(llama_context_unified * ctx) : ctx(ctx) {} + llama_data_write(llama_context_kv_self * ctx) : ctx(ctx) {} virtual ~llama_data_write() = default; virtual void write(const void * src, size_t size) = 0; @@ -3215,11 +3215,11 @@ struct llama_data_write { } } - llama_context_unified * ctx; + llama_context_kv_self * ctx; }; struct llama_data_read { - llama_data_read(llama_context_unified * ctx) : ctx(ctx) {} + llama_data_read(llama_context_kv_self * ctx) : ctx(ctx) {} virtual ~llama_data_read() = default; virtual const uint8_t * read(size_t size) = 0; @@ -3311,11 +3311,11 @@ struct llama_data_read { } } - llama_context_unified * ctx; + llama_context_kv_self * ctx; }; struct llama_data_write_dummy : llama_data_write { - llama_data_write_dummy(llama_context_unified * ctx) : llama_data_write(ctx) {} + llama_data_write_dummy(llama_context_kv_self * ctx) : llama_data_write(ctx) {} void write(const void * /* src */, size_t size) override { size_written += size; @@ -3334,7 +3334,7 @@ struct llama_data_write_dummy : llama_data_write { struct llama_data_write_buffer : llama_data_write { llama_data_write_buffer( - llama_context_unified * ctx, + llama_context_kv_self * ctx, uint8_t * p, size_t len) : llama_data_write(ctx), ptr(p), buf_size(len) {} void write(const void * src, size_t size) override { @@ -3368,7 +3368,7 @@ struct llama_data_write_buffer : llama_data_write { struct llama_data_read_buffer : llama_data_read { llama_data_read_buffer( - llama_context_unified * ctx, + llama_context_kv_self * ctx, const uint8_t * p, size_t len) : llama_data_read(ctx), ptr(p), buf_size(len) {} const uint8_t * read(size_t size) override { @@ -3397,7 +3397,7 @@ struct llama_data_read_buffer : llama_data_read { struct llama_data_write_file : llama_data_write { llama_data_write_file( - llama_context_unified * ctx, + llama_context_kv_self * ctx, llama_file * f) : llama_data_write(ctx), file(f) {} void write(const 
void * src, size_t size) override { @@ -3422,7 +3422,7 @@ struct llama_data_write_file : llama_data_write { struct llama_data_read_file : llama_data_read { llama_data_read_file( - llama_context_unified * ctx, + llama_context_kv_self * ctx, llama_file * f) : llama_data_read(ctx), file(f) {} void read_to(void * dst, size_t size) override { @@ -3445,7 +3445,7 @@ struct llama_data_read_file : llama_data_read { std::vector temp_buffer; }; -size_t llama_context_unified::state_get_size() { +size_t llama_context_kv_self::state_get_size() { llama_data_write_dummy data_ctx(this); try { return state_get_data(data_ctx); @@ -3455,7 +3455,7 @@ size_t llama_context_unified::state_get_size() { } } -size_t llama_context_unified::state_get_data(uint8_t * dst, size_t size) { +size_t llama_context_kv_self::state_get_data(uint8_t * dst, size_t size) { llama_data_write_buffer data_ctx(this, dst, size); try { return state_get_data(data_ctx); @@ -3465,7 +3465,7 @@ size_t llama_context_unified::state_get_data(uint8_t * dst, size_t size) { } } -size_t llama_context_unified::state_set_data(const uint8_t * src, size_t size) { +size_t llama_context_kv_self::state_set_data(const uint8_t * src, size_t size) { llama_data_read_buffer data_ctx(this, src, size); try { return state_set_data(data_ctx); @@ -3475,7 +3475,7 @@ size_t llama_context_unified::state_set_data(const uint8_t * src, size_t size) { } } -size_t llama_context_unified::state_seq_get_size(llama_seq_id seq_id) { +size_t llama_context_kv_self::state_seq_get_size(llama_seq_id seq_id) { llama_data_write_dummy data_ctx(this); try { return state_seq_get_data(data_ctx, seq_id); @@ -3485,7 +3485,7 @@ size_t llama_context_unified::state_seq_get_size(llama_seq_id seq_id) { } } -size_t llama_context_unified::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { +size_t llama_context_kv_self::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { llama_data_write_buffer data_ctx(this, dst, size); try { return state_seq_get_data(data_ctx, seq_id); @@ -3495,7 +3495,7 @@ size_t llama_context_unified::state_seq_get_data(llama_seq_id seq_id, uint8_t * } } -size_t llama_context_unified::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { +size_t llama_context_kv_self::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { llama_data_read_buffer data_ctx(this, src, size); try { return state_seq_set_data(data_ctx, seq_id); @@ -3505,7 +3505,7 @@ size_t llama_context_unified::state_seq_set_data(llama_seq_id seq_id, const uint } } -bool llama_context_unified::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +bool llama_context_kv_self::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(filepath, "rb"); // sanity checks @@ -3548,7 +3548,7 @@ bool llama_context_unified::state_load_file(const char * filepath, llama_token * return true; } -bool llama_context_unified::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { +bool llama_context_kv_self::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { llama_file file(filepath, "wb"); file.write_u32(LLAMA_SESSION_MAGIC); @@ -3565,7 +3565,7 @@ bool llama_context_unified::state_save_file(const char * filepath, const llama_t return true; } -size_t llama_context_unified::state_seq_load_file(llama_seq_id seq_id, const char * filepath, 
llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +size_t llama_context_kv_self::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(filepath, "rb"); // version checks @@ -3608,7 +3608,7 @@ size_t llama_context_unified::state_seq_load_file(llama_seq_id seq_id, const cha return file.tell(); } -size_t llama_context_unified::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { +size_t llama_context_kv_self::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { llama_file file(filepath, "wb"); file.write_u32(LLAMA_STATE_SEQ_MAGIC); @@ -3641,7 +3641,7 @@ size_t llama_context_unified::state_seq_save_file(llama_seq_id seq_id, const cha * llama_state_get_data_internal(ctx, data_ctx); * */ -size_t llama_context_unified::state_get_data(llama_data_write & data_ctx) { +size_t llama_context_kv_self::state_get_data(llama_data_write & data_ctx) { synchronize(); data_ctx.write_model_info(); @@ -3667,7 +3667,7 @@ size_t llama_context_unified::state_get_data(llama_data_write & data_ctx) { return data_ctx.get_size_written(); } -size_t llama_context_unified::state_set_data(llama_data_read & data_ctx) { +size_t llama_context_kv_self::state_set_data(llama_data_read & data_ctx) { synchronize(); data_ctx.read_model_info(); @@ -3693,7 +3693,7 @@ size_t llama_context_unified::state_set_data(llama_data_read & data_ctx) { return data_ctx.get_size_read(); } -size_t llama_context_unified::state_seq_get_data(llama_data_write & data_ctx, llama_seq_id seq_id) { +size_t llama_context_kv_self::state_seq_get_data(llama_data_write & data_ctx, llama_seq_id seq_id) { synchronize(); llama_kv_cache::io io = { @@ -3712,7 +3712,7 @@ size_t llama_context_unified::state_seq_get_data(llama_data_write & data_ctx, ll return data_ctx.get_size_written(); } -size_t llama_context_unified::state_seq_set_data(llama_data_read & data_ctx, llama_seq_id seq_id) { +size_t llama_context_kv_self::state_seq_set_data(llama_data_read & data_ctx, llama_seq_id seq_id) { synchronize(); llama_kv_cache::io io = { diff --git a/src/llama-context.h b/src/llama-context.h index dc85c797100a4..648a41045a070 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -82,6 +82,8 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); + // graph build API (generic) + virtual void build_cb( ggml_tensor * cur, const char * name, @@ -91,6 +93,27 @@ struct llama_context : public llama_graph_i { // TODO: add encode/decode graphs virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case); + // apply control vector for layer il + virtual ggml_tensor * build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il); + + // do mat_mul, while optionally apply lora + virtual ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + // do mat_mul_id, while optionally apply lora + virtual ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + + virtual ggml_tensor * build_rope_factors(int il); + // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -116,29 +139,6 @@ struct llama_context : 
public llama_graph_i { // virtual int encode(llama_batch & inp_batch) = 0; - // graph build API (generic) - - // apply control vector for layer il - virtual ggml_tensor * build_cvec( - ggml_context * ctx0, - ggml_tensor * cur, - int il); - - // do mat_mul, while optionally apply lora - virtual ggml_tensor * build_lora_mm( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur); - - // do mat_mul_id, while optionally apply lora - virtual ggml_tensor * build_lora_mm_id( - ggml_context * ctx0, - ggml_tensor * w, // struct ggml_tensor * as - ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids); - - virtual ggml_tensor * build_rope_factors(int il); - // state save/load virtual size_t state_get_size() = 0; @@ -217,16 +217,16 @@ struct llama_context : public llama_graph_i { mutable int32_t n_eval = 0; // number of eval calls }; -// TODO: make implementation details private -class llama_context_unified : public llama_context { +// transformer with a self-attention KV cache +class llama_context_kv_self : public llama_context { public: struct batch_manager; - llama_context_unified( + llama_context_kv_self( const llama_model & model, const llama_context_params & params); - virtual ~llama_context_unified(); + virtual ~llama_context_kv_self(); virtual uint32_t n_seq_max() const override; diff --git a/src/llama-graph.h b/src/llama-graph.h index d111d76e92b93..5267d53da4c06 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -2,6 +2,9 @@ #include +// note: do not add high-level objects here, such as llama_context, llama_kv_cache, etc. +// not sure about llama_batch/llama_sbatch yet + struct ggml_cgraph; struct ggml_context; struct ggml_tensor; diff --git a/src/llama-model.h b/src/llama-model.h index 5d2a07abc570f..0374b484b10ab 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -368,6 +368,7 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; // TODO: add encode/decode graphs + // TODO: return a struct containing the graph and the output tensors, such as logits, embeddings, etc. 
ggml_cgraph * build_graph( llama_graph_i & lgf, const llama_cparams & cparams, diff --git a/src/llama.cpp b/src/llama.cpp index 83b66035fc585..d20a2a6d50f60 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -327,7 +327,7 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct - ctx = new llama_context_unified(*model, params); + ctx = new llama_context_kv_self(*model, params); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); return nullptr; From 3a504d9a0bd7d952d22cd2d707446de2316ec955 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 12:18:44 +0200 Subject: [PATCH 39/84] llama : introduce llama_io interfaces ggml-ci --- src/CMakeLists.txt | 1 + src/llama-context.cpp | 488 +++++++++++++++-------------------------- src/llama-context.h | 14 +- src/llama-io.cpp | 15 ++ src/llama-io.h | 35 +++ src/llama-kv-cache.cpp | 18 +- src/llama-kv-cache.h | 21 +- 7 files changed, 254 insertions(+), 338 deletions(-) create mode 100644 src/llama-io.cpp create mode 100644 src/llama-io.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f1f5d41d495a1..7f919c90ec5c3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ add_library(llama llama-graph.cpp llama-hparams.cpp llama-impl.cpp + llama-io.cpp llama-kv-cache.cpp llama-mmap.cpp llama-model-loader.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 665a144d70252..d6618f1438869 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2,6 +2,7 @@ #include "llama-impl.h" #include "llama-mmap.h" +#include "llama-io.h" #include #include @@ -3128,214 +3129,29 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( // TODO: this needs a big rework -// TODO: replace all non-fatal assertions with returned errors or exceptions -struct llama_data_write { - llama_data_write(llama_context_kv_self * ctx) : ctx(ctx) {} - virtual ~llama_data_write() = default; - - virtual void write(const void * src, size_t size) = 0; - virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; - virtual size_t get_size_written() = 0; - - void write_string(const std::string & str) { - uint32_t str_size = str.size(); - - write(&str_size, sizeof(str_size)); - write(str.data(), str_size); - } - - void write_model_info() { - const auto & model = ctx->get_model(); - const std::string arch_str = llm_arch_name(model.arch); - write_string(arch_str); - // TODO: add more model-specific info which should prevent loading the session file if not identical - } - - //void write_rng(const std::mt19937 & rng) { - // std::ostringstream rng_ss; - // rng_ss << rng; - - // const std::string & rng_str = rng_ss.str(); - - // write_string(rng_str); - //} - - void write_output_ids() { - ctx->reorder_outputs(); - - const uint32_t n_outputs = ctx->n_outputs; - - std::vector output_pos; - - const size_t n_batch = ctx->n_batch(); - const auto & output_ids = ctx->output_ids; - - GGML_ASSERT(n_outputs <= ctx->output_size); - - output_pos.resize(n_outputs); - - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch; ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - GGML_ASSERT((uint32_t) pos < n_outputs); - output_pos[pos] = i; - } - } - - write(&n_outputs, sizeof(n_outputs)); - - if (n_outputs) { - write(output_pos.data(), n_outputs * sizeof(int32_t)); - } - } - - void 
write_logits() { - const auto & model = ctx->get_model(); - - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * model.vocab.n_tokens()); - - write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - write(ctx->logits, logits_size * sizeof(float)); - } - } - - void write_embeddings() { - const auto & model = ctx->get_model(); - - const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * model.hparams.n_embd); - - write(&embeddings_size, sizeof(embeddings_size)); - - if (embeddings_size) { - write(ctx->embd, embeddings_size * sizeof(float)); - } - } - - llama_context_kv_self * ctx; -}; - -struct llama_data_read { - llama_data_read(llama_context_kv_self * ctx) : ctx(ctx) {} - virtual ~llama_data_read() = default; - - virtual const uint8_t * read(size_t size) = 0; - virtual void read_to(void * dst, size_t size) = 0; - virtual size_t get_size_read() = 0; - - void read_string(std::string & str) { - uint32_t str_size; - read_to(&str_size, sizeof(str_size)); - - str.assign((const char *) read(str_size), str_size); - } - - // validate model information - void read_model_info() { - const auto & model = ctx->get_model(); - - const std::string cur_arch_str = llm_arch_name(model.arch); - - std::string arch_str; - read_string(arch_str); - if (cur_arch_str != arch_str) { - throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); - } - // TODO: add more info which needs to be identical but which is not verified otherwise - } - - //void read_rng(std::mt19937 & rng) { - // std::string rng_str; - // read_string(rng_str); - - // std::istringstream rng_ss(rng_str); - // rng_ss >> rng; - - // if (rng_ss.fail()) { - // throw std::runtime_error("failed to load RNG state"); - // } - //} - - void read_output_ids() { - std::vector output_pos; - - uint32_t n_outputs; - read_to(&n_outputs, sizeof(n_outputs)); - - if (n_outputs > ctx->reserve_outputs(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } - - if (n_outputs) { - output_pos.resize(n_outputs); - read_to(output_pos.data(), n_outputs * sizeof(int32_t)); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= ctx->n_batch()) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->n_batch())); - } - ctx->output_ids[id] = i; - } - - ctx->n_outputs = n_outputs; - } - } - - void read_logits() { - uint64_t logits_size; - read_to(&logits_size, sizeof(logits_size)); - - if (ctx->logits_size < logits_size) { - throw std::runtime_error("logits buffer too small"); - } - - if (logits_size) { - read_to(ctx->logits, logits_size * sizeof(float)); - } - } - - void read_embeddings() { - uint64_t embeddings_size; - read_to(&embeddings_size, sizeof(embeddings_size)); - - if (ctx->embd_size < embeddings_size) { - throw std::runtime_error("embeddings buffer too small"); - } - - if (embeddings_size) { - read_to(ctx->embd, embeddings_size * sizeof(float)); - } - } - - llama_context_kv_self * ctx; -}; - -struct llama_data_write_dummy : llama_data_write { - llama_data_write_dummy(llama_context_kv_self * ctx) : llama_data_write(ctx) {} +class llama_io_write_dummy : public llama_io_write_i { +public: + llama_io_write_dummy() = default; void write(const void * /* src */, size_t size) override { size_written += size; } - void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t 
size) override { + void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { size_written += size; } - size_t get_size_written() override { + size_t n_bytes() override { return size_written; } size_t size_written = 0; }; -struct llama_data_write_buffer : llama_data_write { - llama_data_write_buffer( - llama_context_kv_self * ctx, - uint8_t * p, size_t len) : llama_data_write(ctx), ptr(p), buf_size(len) {} +class llama_io_write_buffer : public llama_io_write_i { +public: + llama_io_write_buffer( + uint8_t * p, size_t len) : ptr(p), buf_size(len) {} void write(const void * src, size_t size) override { if (size > buf_size) { @@ -3347,7 +3163,7 @@ struct llama_data_write_buffer : llama_data_write { buf_size -= size; } - void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { if (size > buf_size) { throw std::runtime_error("unexpectedly reached end of buffer"); } @@ -3357,7 +3173,7 @@ struct llama_data_write_buffer : llama_data_write { buf_size -= size; } - size_t get_size_written() override { + size_t n_bytes() override { return size_written; } @@ -3366,10 +3182,9 @@ struct llama_data_write_buffer : llama_data_write { size_t size_written = 0; }; -struct llama_data_read_buffer : llama_data_read { - llama_data_read_buffer( - llama_context_kv_self * ctx, - const uint8_t * p, size_t len) : llama_data_read(ctx), ptr(p), buf_size(len) {} +class llama_io_read_buffer : public llama_io_read_i { +public: + llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} const uint8_t * read(size_t size) override { const uint8_t * base_ptr = ptr; @@ -3386,7 +3201,7 @@ struct llama_data_read_buffer : llama_data_read { memcpy(dst, read(size), size); } - size_t get_size_read() override { + size_t n_bytes() override { return size_read; } @@ -3395,23 +3210,22 @@ struct llama_data_read_buffer : llama_data_read { size_t size_read = 0; }; -struct llama_data_write_file : llama_data_write { - llama_data_write_file( - llama_context_kv_self * ctx, - llama_file * f) : llama_data_write(ctx), file(f) {} +class llama_io_write_file : public llama_io_write_i { +public: + llama_io_write_file(llama_file * f) : file(f) {} void write(const void * src, size_t size) override { file->write_raw(src, size); size_written += size; } - void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { temp_buffer.resize(size); ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); write(temp_buffer.data(), temp_buffer.size()); } - size_t get_size_written() override { + size_t n_bytes() override { return size_written; } @@ -3420,10 +3234,9 @@ struct llama_data_write_file : llama_data_write { std::vector temp_buffer; }; -struct llama_data_read_file : llama_data_read { - llama_data_read_file( - llama_context_kv_self * ctx, - llama_file * f) : llama_data_read(ctx), file(f) {} +class llama_io_read_file : public llama_io_read_i { +public: + llama_io_read_file(llama_file * f) : file(f) {} void read_to(void * dst, size_t size) override { file->read_raw(dst, size); @@ -3436,7 +3249,7 @@ struct llama_data_read_file : llama_data_read { return temp_buffer.data(); } - size_t get_size_read() override { + size_t n_bytes() override { return size_read; } @@ -3446,9 +3259,9 @@ struct llama_data_read_file : llama_data_read { }; size_t 
llama_context_kv_self::state_get_size() { - llama_data_write_dummy data_ctx(this); + llama_io_write_dummy io; try { - return state_get_data(data_ctx); + return state_get_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); return 0; @@ -3456,9 +3269,9 @@ size_t llama_context_kv_self::state_get_size() { } size_t llama_context_kv_self::state_get_data(uint8_t * dst, size_t size) { - llama_data_write_buffer data_ctx(this, dst, size); + llama_io_write_buffer io(dst, size); try { - return state_get_data(data_ctx); + return state_get_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); return 0; @@ -3466,9 +3279,9 @@ size_t llama_context_kv_self::state_get_data(uint8_t * dst, size_t size) { } size_t llama_context_kv_self::state_set_data(const uint8_t * src, size_t size) { - llama_data_read_buffer data_ctx(this, src, size); + llama_io_read_buffer io(src, size); try { - return state_set_data(data_ctx); + return state_set_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; @@ -3476,9 +3289,9 @@ size_t llama_context_kv_self::state_set_data(const uint8_t * src, size_t size) { } size_t llama_context_kv_self::state_seq_get_size(llama_seq_id seq_id) { - llama_data_write_dummy data_ctx(this); + llama_io_write_dummy io; try { - return state_seq_get_data(data_ctx, seq_id); + return state_seq_get_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); return 0; @@ -3486,9 +3299,9 @@ size_t llama_context_kv_self::state_seq_get_size(llama_seq_id seq_id) { } size_t llama_context_kv_self::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { - llama_data_write_buffer data_ctx(this, dst, size); + llama_io_write_buffer io(dst, size); try { - return state_seq_get_data(data_ctx, seq_id); + return state_seq_get_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); return 0; @@ -3496,9 +3309,9 @@ size_t llama_context_kv_self::state_seq_get_data(llama_seq_id seq_id, uint8_t * } size_t llama_context_kv_self::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { - llama_data_read_buffer data_ctx(this, src, size); + llama_io_read_buffer io(src, size); try { - return state_seq_set_data(data_ctx, seq_id); + return state_seq_set_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; @@ -3536,8 +3349,8 @@ bool llama_context_kv_self::state_load_file(const char * filepath, llama_token * { const size_t n_state_size_cur = file.size() - file.tell(); - llama_data_read_file data_ctx(this, &file); - const size_t n_read = state_set_data(data_ctx); + llama_io_read_file io( &file); + const size_t n_read = state_set_data(io); if (n_read != n_state_size_cur) { LLAMA_LOG_ERROR("%s: did not read all of the session file data! 
size %zu, got %zu\n", __func__, n_state_size_cur, n_read); @@ -3559,8 +3372,8 @@ bool llama_context_kv_self::state_save_file(const char * filepath, const llama_t file.write_raw(tokens, sizeof(llama_token) * n_token_count); // save the context state using stream saving - llama_data_write_file data_ctx(this, &file); - state_get_data(data_ctx); + llama_io_write_file io(&file); + state_get_data(io); return true; } @@ -3595,8 +3408,8 @@ size_t llama_context_kv_self::state_seq_load_file(llama_seq_id seq_id, const cha // restore the context state { const size_t state_size = file.size() - file.tell(); - llama_data_read_file data_ctx(this, &file); - const size_t nread = state_seq_set_data(data_ctx, seq_id); + llama_io_read_file io(&file); + const size_t nread = state_seq_set_data(io, seq_id); if (!nread) { LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); return 0; @@ -3619,116 +3432,171 @@ size_t llama_context_kv_self::state_seq_save_file(llama_seq_id seq_id, const cha file.write_raw(tokens, sizeof(llama_token) * n_token_count); // save the context state using stream saving - llama_data_write_file data_ctx(this, &file); - state_seq_get_data(data_ctx, seq_id); + llama_io_write_file io(&file); + state_seq_get_data(io, seq_id); const size_t res = file.tell(); - GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written()); + GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes()); return res; } -/** copy state data into either a buffer or file depending on the passed in context - * - * file context: - * llama_file file("/path", "wb"); - * llama_data_write_file data_ctx(&file); - * llama_state_get_data_internal(ctx, data_ctx); - * - * buffer context: - * std::vector buf(max_size, 0); - * llama_data_write_buffer data_ctx(buf.data(), max_size); - * llama_state_get_data_internal(ctx, data_ctx); - * -*/ -size_t llama_context_kv_self::state_get_data(llama_data_write & data_ctx) { +size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { synchronize(); - data_ctx.write_model_info(); - - // copy outputs - data_ctx.write_output_ids(); - data_ctx.write_logits(); - data_ctx.write_embeddings(); - - llama_kv_cache::io io = { - /* .write = */ [&](const void * src, size_t size) { - data_ctx.write(src, size); - }, - /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { - data_ctx.write_tensor_data(tensor, offset, size); - }, - /* .read = */ nullptr, - /* .read_to = */ nullptr, - }; + // write model info + { + const std::string arch_str = llm_arch_name(model.arch); + io.write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical + } + + // write output ids + { + reorder_outputs(); + + const uint32_t n_outputs = this->n_outputs; + const auto & output_ids = this->output_ids; + + std::vector w_output_pos; + + GGML_ASSERT(n_outputs <= output_size); + + w_output_pos.resize(n_outputs); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch(); ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + GGML_ASSERT((uint32_t) pos < n_outputs); + w_output_pos[pos] = i; + } + } + + io.write(&n_outputs, sizeof(n_outputs)); + + if (n_outputs) { + io.write(w_output_pos.data(), n_outputs * sizeof(int32_t)); + } + } + + // write logits + { + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) 
n_outputs * model.vocab.n_tokens()); + + io.write(&logits_size, sizeof(logits_size)); + + if (logits_size) { + io.write(logits, logits_size * sizeof(float)); + } + } + + // write mbeddings + { + const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); + + io.write(&embd_size, sizeof(embd_size)); + + if (embd_size) { + io.write(embd, embd_size * sizeof(float)); + } + } kv_self.state_write(io, model.hparams); - return data_ctx.get_size_written(); + return io.n_bytes(); } -size_t llama_context_kv_self::state_set_data(llama_data_read & data_ctx) { +size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { synchronize(); - data_ctx.read_model_info(); - - // set outputs - data_ctx.read_output_ids(); - data_ctx.read_logits(); - data_ctx.read_embeddings(); - - llama_kv_cache::io io = { - /* .write = */ nullptr, - /* .write_tensor_data = */ nullptr, - /* .read = */ [&](size_t size) { - return data_ctx.read(size); - }, - /* .read_to = */ [&](void * dst, size_t size) { - data_ctx.read_to(dst, size); - }, - }; + // read model info + { + const std::string cur_arch_str = llm_arch_name(model.arch); + + std::string arch_str; + io.read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); + } + // TODO: add more info which needs to be identical but which is not verified otherwise + } + + // read output ids + { + std::vector output_pos; + + uint32_t n_outputs; + io.read_to(&n_outputs, sizeof(n_outputs)); + + if (n_outputs > reserve_outputs(n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } + + if (n_outputs) { + output_pos.resize(n_outputs); + io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); + } + this->output_ids[id] = i; + } + + this->n_outputs = n_outputs; + } + } + + // read logits + { + uint64_t logits_size; + io.read_to(&logits_size, sizeof(logits_size)); + + if (this->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } + + if (logits_size) { + io.read_to(this->logits, logits_size * sizeof(float)); + } + } + + // read embeddings + { + uint64_t embd_size; + io.read_to(&embd_size, sizeof(embd_size)); + + if (this->embd_size < embd_size) { + throw std::runtime_error("embeddings buffer too small"); + } + + if (embd_size) { + io.read_to(this->embd, embd_size * sizeof(float)); + } + } kv_self.state_read(io, model.hparams); - return data_ctx.get_size_read(); + return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_get_data(llama_data_write & data_ctx, llama_seq_id seq_id) { +size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { synchronize(); - llama_kv_cache::io io = { - /* .write = */ [&](const void * src, size_t size) { - data_ctx.write(src, size); - }, - /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { - data_ctx.write_tensor_data(tensor, offset, size); - }, - /* .read = */ nullptr, - /* .read_to = */ nullptr, - }; - kv_self.state_write(io, model.hparams, seq_id); - return data_ctx.get_size_written(); + return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_set_data(llama_data_read & data_ctx, llama_seq_id seq_id) { 
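Taken together, the writer/reader classes introduced here let the same serialization routine run twice: once against a dummy writer to measure the state size, and once against a buffer writer to perform the actual copy. The following is a small, self-contained usage sketch of that dummy-then-buffer pattern; write_state() and its payload are hypothetical stand-ins for the real state serialization, and it assumes the concrete llama_io_write_dummy / llama_io_write_buffer classes from llama-context.cpp are visible in the same translation unit:

#include <cstdint>
#include <string>
#include <vector>

#include "llama-io.h" // llama_io_write_i, introduced by this patch

// hypothetical payload writer, standing in for state_get_data(llama_io_write_i &)
static size_t write_state(llama_io_write_i & io) {
    const uint32_t version = 1;        // made-up field, for illustration only
    io.write(&version, sizeof(version));
    io.write_string("example-state");  // helper implemented in llama-io.cpp
    return io.n_bytes();
}

static std::vector<uint8_t> serialize_state() {
    // pass 1: count bytes without copying anything
    llama_io_write_dummy counter;
    const size_t n_bytes = write_state(counter);

    // pass 2: stream the same data into a caller-owned buffer of exactly that size
    std::vector<uint8_t> buf(n_bytes);
    llama_io_write_buffer writer(buf.data(), buf.size());
    write_state(writer);

    return buf;
}

The same two-pass idea is what state_get_size() and state_get_data(uint8_t *, size_t) implement above, just routed through the virtual llama_io_write_i interface so that file- and buffer-backed targets share one code path.
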
+size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { synchronize(); - llama_kv_cache::io io = { - /* .write = */ nullptr, - /* .write_tensor_data = */ nullptr, - /* .read = */ [&](size_t size) { - return data_ctx.read(size); - }, - /* .read_to = */ [&](void * dst, size_t size) { - data_ctx.read_to(dst, size); - }, - }; - kv_self.state_read(io, model.hparams, seq_id); - return data_ctx.get_size_read(); + return io.n_bytes(); } // diff --git a/src/llama-context.h b/src/llama-context.h index 648a41045a070..204793d75a5b1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -15,6 +15,9 @@ #include #include +class llama_io_read_i; +class llama_io_write_i; + using llama_loras = std::unordered_map; struct llama_context : public llama_graph_i { @@ -178,9 +181,10 @@ struct llama_context : public llama_graph_i { virtual llama_perf_context_data perf_get_data() const; virtual void perf_reset(); +protected: + // members -protected: const llama_model & model; llama_cparams cparams; @@ -502,11 +506,11 @@ class llama_context_kv_self : public llama_context { size_t n_token_count) override; private: - size_t state_get_data(struct llama_data_write & data_ctx); - size_t state_set_data(struct llama_data_read & data_ctx); + size_t state_get_data(llama_io_write_i & io); + size_t state_set_data(llama_io_read_i & io); - size_t state_seq_get_data(struct llama_data_write & data_ctx, llama_seq_id seq_id); - size_t state_seq_set_data(struct llama_data_read & data_ctx, llama_seq_id seq_id); + size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); + size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); }; // For internal test use diff --git a/src/llama-io.cpp b/src/llama-io.cpp new file mode 100644 index 0000000000000..7ad70d163343d --- /dev/null +++ b/src/llama-io.cpp @@ -0,0 +1,15 @@ +#include "llama-io.h" + +void llama_io_write_i::write_string(const std::string & str) { + uint32_t str_size = str.size(); + + write(&str_size, sizeof(str_size)); + write(str.data(), str_size); +} + +void llama_io_read_i::read_string(std::string & str) { + uint32_t str_size; + read_to(&str_size, sizeof(str_size)); + + str.assign((const char *) read(str_size), str_size); +} diff --git a/src/llama-io.h b/src/llama-io.h new file mode 100644 index 0000000000000..ce9216b83b192 --- /dev/null +++ b/src/llama-io.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +struct ggml_tensor; + +class llama_io_write_i { +public: + llama_io_write_i() = default; + virtual ~llama_io_write_i() = default; + + virtual void write(const void * src, size_t size) = 0; + virtual void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) = 0; + + // bytes written so far + virtual size_t n_bytes() = 0; + + void write_string(const std::string & str); +}; + +class llama_io_read_i { +public: + llama_io_read_i() = default; + virtual ~llama_io_read_i() = default; + + virtual const uint8_t * read(size_t size) = 0; + virtual void read_to(void * dst, size_t size) = 0; + + // bytes read so far + virtual size_t n_bytes() = 0; + + void read_string(std::string & str); +}; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index b79c2ff934a6e..c93410f0a412c 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -698,7 +698,7 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) const { +void 
llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -736,7 +736,7 @@ void llama_kv_cache::state_write(const io & io, const llama_hparams & hparams, l state_write_data(io, cell_ranges, hparams); } -void llama_kv_cache::state_read(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) { +void llama_kv_cache::state_read(llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id) { uint32_t cell_count; io.read_to(&cell_count, sizeof(cell_count)); @@ -754,7 +754,7 @@ void llama_kv_cache::state_read(const io & io, const llama_hparams & hparams, ll } } -void llama_kv_cache::state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { +void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { for (const auto & range : cell_ranges) { for (uint32_t i = range.first; i < range.second; ++i) { const auto & cell = cells[i]; @@ -773,7 +773,7 @@ void llama_kv_cache::state_write_meta(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { +void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { const uint32_t v_trans = this->v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; @@ -799,7 +799,7 @@ void llama_kv_cache::state_write_data(const io & io, const std::vector write; - std::function write_tensor_data; - - std::function read; - std::function read_to; - }; - - void state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; - void state_read (const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + void state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; + void state_read (llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); private: ggml_type type_k = GGML_TYPE_F16; @@ -132,11 +125,11 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; - void state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; - void state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; - bool state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count); + bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count); }; // From f7c7757babe54db018f8f16953148cb79a287d17 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 12:37:28 +0200 Subject: [PATCH 40/84] context : abstract state read/write ggml-ci --- src/llama-context.cpp | 2882 +++++++++++++++++++++-------------------- src/llama-context.h | 72 +- 2 files changed, 1482 insertions(+), 1472 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d6618f1438869..bde6659531024 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -326,1027 +326,1169 @@ 
ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } -void llama_context::perf_reset() { - t_start_us = ggml_time_us(); - t_eval_us = n_eval = 0; - t_p_eval_us = n_p_eval = 0; -} - // -// llama_context_kv_self +// state // -llama_context_kv_self::llama_context_kv_self( - const llama_model & model, - const llama_context_params & params) : llama_context(model) { - const auto & hparams = model.hparams; - - cparams.n_seq_max = std::max(1u, params.n_seq_max); - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; - cparams.embeddings = params.embeddings; - cparams.offload_kqv = params.offload_kqv; - cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); +class llama_io_write_dummy : public llama_io_write_i { +public: + llama_io_write_dummy() = default; - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + void write(const void * /* src */, size_t size) override { + size_written += size; + } - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; + void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { + size_written += size; } - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + size_t n_bytes() override { + return size_written; + } - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_ctx_orig_yarn != 0 ? 
hparams.n_ctx_orig_yarn : - hparams.n_ctx_train; + size_t size_written = 0; +}; - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; +class llama_io_write_buffer : public llama_io_write_i { +public: + llama_io_write_buffer( + uint8_t * p, size_t len) : ptr(p), buf_size(len) {} - auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { - rope_scaling_type = hparams.rope_scaling_type_train; + void write(const void * src, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + memcpy(ptr, src, size); + ptr += size; + size_written += size; + buf_size -= size; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { - cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ggml_backend_tensor_get(tensor, ptr, offset, size); + ptr += size; + size_written += size; + buf_size -= size; } - if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + size_t n_bytes() override { + return size_written; } - cparams.yarn_attn_factor *= hparams.rope_attn_factor; + uint8_t * ptr; + size_t buf_size = 0; + size_t size_written = 0; +}; - if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; - } else { - cparams.pooling_type = hparams.pooling_type; +class llama_io_read_buffer : public llama_io_read_i { +public: + llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + + const uint8_t * read(size_t size) override { + const uint8_t * base_ptr = ptr; + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); } + ptr += size; + size_read += size; + buf_size -= size; + return base_ptr; } - if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { - cparams.causal_attn = hparams.causal_attn; - } else { - cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + void read_to(void * dst, size_t size) override { + memcpy(dst, read(size), size); } - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + size_t n_bytes() override { + return size_read; + } - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + const uint8_t * ptr; + size_t buf_size = 0; + size_t size_read = 0; +}; - if (n_ctx_per_seq < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); +class llama_io_write_file : public llama_io_write_i { +public: + llama_io_write_file(llama_file * f) : file(f) {} + + void write(const void * src, 
size_t size) override { + file->write_raw(src, size); + size_written += size; } - if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); + void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { + temp_buffer.resize(size); + ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); + write(temp_buffer.data(), temp_buffer.size()); } - logits_all = params.logits_all; + size_t n_bytes() override { + return size_written; + } - // build worst-case graph for encoder if a model contains encoder - is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() + llama_file * file; + size_t size_written = 0; + std::vector temp_buffer; +}; - uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; +class llama_io_read_file : public llama_io_read_i { +public: + llama_io_read_file(llama_file * f) : file(f) {} - // Mamba only needs a constant number of KV cache cells per sequence - if (llama_model_is_recurrent(&model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + void read_to(void * dst, size_t size) override { + file->read_raw(dst, size); + size_read += size; } - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); - GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + const uint8_t * read(size_t size) override { + temp_buffer.resize(size); + read_to(temp_buffer.data(), size); + return temp_buffer.data(); + } - if (!hparams.vocab_only) { - // GPU backends - for (auto * dev : model.devices) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - throw std::runtime_error("failed to initialize backend"); - } - backends.emplace_back(backend); - } + size_t n_bytes() override { + return size_read; + } - // add ACCEL backends (such as BLAS) - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - throw std::runtime_error("failed to initialize backend"); - } - backends.emplace_back(backend); - } - } + llama_file * file; + size_t size_read = 0; + std::vector temp_buffer; +}; - // add CPU backend - backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - throw std::runtime_error("failed to initialize CPU backend"); - } - backends.emplace_back(backend_cpu); +size_t llama_context::state_get_size() { + llama_io_write_dummy io; + try { + return state_get_data(io); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } +} - // create a list of the set_n_threads functions in the 
backends - for (auto & backend : backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); - } - } - } +size_t llama_context::state_get_data(uint8_t * dst, size_t size) { + llama_io_write_buffer io(dst, size); + try { + return state_get_data(io); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } +} - llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); +size_t llama_context::state_set_data(const uint8_t * src, size_t size) { + llama_io_read_buffer io(src, size); + try { + return state_set_data(io); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } +} - if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { - LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); - throw std::runtime_error("failed to initialize self-attention cache"); - } +size_t llama_context::state_seq_get_size(llama_seq_id seq_id) { + llama_io_write_dummy io; + try { + return state_seq_get_data(io, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } +} - { - const size_t memory_size_k = kv_self.size_k_bytes(); - const size_t memory_size_v = kv_self.size_v_bytes(); +size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { + llama_io_write_buffer io(dst, size); + try { + return state_seq_get_data(io, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } +} - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); +size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { + llama_io_read_buffer io(src, size); + try { + return state_seq_set_data(io, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } +} + +bool llama_context::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); + + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; } + } - // graph outputs buffer - { - // resized during inference when a batch uses more outputs - if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { - LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); - throw std::runtime_error("failed to reserve initial output buffer"); - } + // load the prompt + { + const uint32_t 
n_token_count = file.read_u32(); - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name (buf_output.get()), - ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; } - // scheduler and compute buffers - { - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model.devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; - } - } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } - const size_t max_nodes = model.max_nodes(); + // restore the context state + { + const size_t n_state_size_cur = file.size() - file.tell(); - // buffer used to store the computation graph and the tensor meta data - // TODO: move to base class - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + llama_io_read_file io( &file); + const size_t n_read = state_set_data(io); - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model.n_devices() > 1 && - model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; + if (n_read != n_state_size_cur) { + LLAMA_LOG_ERROR("%s: did not read all of the session file data! 
size %zu, got %zu\n", __func__, n_state_size_cur, n_read); + return false; + } + } - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { - // ignore CPU backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; - } - } - } + return true; +} - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); +bool llama_context::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); - } + file.write_u32(LLAMA_SESSION_MAGIC); + file.write_u32(LLAMA_SESSION_VERSION); - // initialize scheduler with the worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); + // save the context state using stream saving + llama_io_write_file io(&file); + state_get_data(io); - // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(sched.get(), gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); + return true; +} - // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); - ggml_backend_sched_reserve(sched.get(), gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); +size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = build_graph(ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } + // version checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - 
ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } + if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); + return 0; + } + } - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); - } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); - } + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return 0; } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; } -} -llama_context_kv_self::~llama_context_kv_self() = default; + // restore the context state + { + const size_t state_size = file.size() - file.tell(); + llama_io_read_file io(&file); + const size_t nread = state_seq_set_data(io, seq_id); + if (!nread) { + LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); + return 0; + } + GGML_ASSERT(nread <= state_size); + GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); + } -uint32_t llama_context_kv_self::n_seq_max() const { - // TODO: add notion of n_seq_max to llama_kv_cache and use it here - return kv_self.size; + return file.tell(); } -llama_kv_cache * llama_context_kv_self::get_kv_self() { - return &kv_self; -} +size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); -const llama_kv_cache * llama_context_kv_self::get_kv_self() const { - return &kv_self; + file.write_u32(LLAMA_STATE_SEQ_MAGIC); + file.write_u32(LLAMA_STATE_SEQ_VERSION); + + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state using stream saving + llama_io_write_file io(&file); + state_seq_get_data(io, seq_id); + + const size_t res = file.tell(); + GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes()); + + return res; } -float * llama_context_kv_self::get_logits() { - // reorder logits for backward compatibility - reorder_outputs(); +size_t llama_context::state_get_data(llama_io_write_i & io) { + // write model info + { + const std::string arch_str = llm_arch_name(model.arch); + io.write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical + } - return logits; + return io.n_bytes(); } -float * llama_context_kv_self::get_logits_ith(int32_t i) { - int32_t j = -1; +size_t llama_context::state_set_data(llama_io_read_i & io) { + // read model info + { + const std::string cur_arch_str = llm_arch_name(model.arch); - try { - if (logits == nullptr) { - throw std::runtime_error("no logits"); - } - - if (i < 0) { - j = n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); - } - } else if ((size_t) i >= output_ids.size()) { - throw 
std::runtime_error(format("out of range [0, %zu)", output_ids.size())); - } else { - j = output_ids[i]; - } - - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + std::string arch_str; + io.read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); } - - return logits + j*model.vocab.n_tokens(); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif + // TODO: add more info which needs to be identical but which is not verified otherwise } -} - -float * llama_context_kv_self::get_embeddings() { - // reorder embeddings for backward compatibility - reorder_outputs(); - return embd; + return io.n_bytes(); } -float * llama_context_kv_self::get_embeddings_ith(int32_t i) { - int32_t j = -1; - - try { - if (embd == nullptr) { - throw std::runtime_error("no embeddings"); - } - - if (i < 0) { - j = n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); - } - } else if ((size_t) i >= output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); - } else { - j = output_ids[i]; - } - - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } +size_t llama_context::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { + GGML_UNUSED(seq_id); - return embd + j*model.hparams.n_embd; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif - } + return io.n_bytes(); } -float * llama_context_kv_self::get_embeddings_seq(llama_seq_id seq_id) { - auto it = embd_seq.find(seq_id); - if (it == embd_seq.end()) { - return nullptr; - } +size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { + GGML_UNUSED(seq_id); - return it->second.data(); + return io.n_bytes(); } -ggml_context_ptr llama_context_kv_self::init() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; - inp_KQ_mask = nullptr; - inp_KQ_mask_cnv = nullptr; - inp_KQ_mask_swa = nullptr; - inp_KQ_mask_swa_cnv = nullptr; - inp_KQ_mask_cross = nullptr; - inp_K_shift = nullptr; - inp_s_copy = nullptr; - inp_s_mask = nullptr; - - return llama_context::init(); +void llama_context::perf_reset() { + t_start_us = ggml_time_us(); + t_eval_us = n_eval = 0; + t_p_eval_us = n_p_eval = 0; } -struct llama_context_kv_self::batch_manager { - batch_manager(llama_context_kv_self & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { - const auto & model = lctx.model; - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - - const auto & kv_self = lctx.kv_self; - - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; +// +// 
llama_context_kv_self +// - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT +llama_context_kv_self::llama_context_kv_self( + const llama_model & model, + const llama_context_params & params) : llama_context(model) { + const auto & hparams = model.hparams; - if (batch.token) { - for (int64_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - throw std::runtime_error("invalid token"); - } - } - } + cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; + cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; - GGML_ASSERT(n_tokens_all <= cparams.n_batch); + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); - if (lctx.t_compute_start_us == 0) { - lctx.t_compute_start_us = ggml_time_us(); - } - lctx.n_queued_tokens += n_tokens_all; + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } - lctx.embd_seq.clear(); + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - // count outputs - if (batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens_all; ++i) { - n_outputs_all += batch.logits[i] != 0; - } - } else if (lctx.logits_all || embd_pooled) { - n_outputs_all = n_tokens_all; - } else { - // keep last output only - n_outputs_all = 1; - } + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? 
hparams.n_ctx_orig_yarn : + hparams.n_ctx_train; - const bool logits_all = n_outputs_all == n_tokens_all; + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ logits_all); + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; } - ~batch_manager() { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } - bool is_done() const { - return lctx.sbatch.n_tokens == 0; + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } - llama_ubatch next() { - llama_ubatch ubatch = llama_ubatch(); - - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; - - const auto & n_ubatch = cparams.n_ubatch; - - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + cparams.yarn_attn_factor *= hparams.rope_attn_factor; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); - } + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); + cparams.pooling_type = hparams.pooling_type; } - - return ubatch; } - bool prepare(const llama_ubatch & ubatch) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & batch = lctx.sbatch.batch; + if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { + cparams.causal_attn = hparams.causal_attn; + } else { + cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + } - const auto n_tokens_all = batch->n_tokens; + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - auto & kv_self = lctx.kv_self; + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - // count the outputs in this u_batch - { - int32_t n_outputs_new = 0; + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } - if (n_outputs_all == n_tokens_all) { - n_outputs_new = ubatch.n_tokens; - } else { - GGML_ASSERT(ubatch.output); - for (uint32_t i = 0; i < ubatch.n_tokens; i++) { - n_outputs_new += (int32_t) (ubatch.output[i] != 0); - } - } + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible 
training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } - // needs to happen before the graph is built - lctx.n_outputs = n_outputs_new; - } + logits_all = params.logits_all; - // non-causal masks do not use the KV cache - if (hparams.causal_attn) { - lctx.kv_self_update(); + // build worst-case graph for encoder if a model contains encoder + is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { - kv_self.head = 0; + uint32_t kv_size = cparams.n_ctx; + ggml_type type_k = params.type_k; + ggml_type type_v = params.type_v; + + // Mamba only needs a constant number of KV cache cells per sequence + if (llama_model_is_recurrent(&model)) { + // Mamba needs at least as many KV cells as there are sequences kept at any time + kv_size = std::max((uint32_t) 1, params.n_seq_max); + // it's probably best to keep as much precision as possible for the states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + } + + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + + if (!hparams.vocab_only) { + // GPU backends + for (auto * dev : model.devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); } + backends.emplace_back(backend); + } - const auto slot_info = kv_self.find_slot(ubatch); - if (!slot_info) { - return false; + // add ACCEL backends (such as BLAS) + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); } + } - kv_slot_restorer.save(slot_info); + // add CPU backend + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + throw std::runtime_error("failed to initialize CPU backend"); + } + backends.emplace_back(backend_cpu); - if (!kv_self.recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); + // create a list of the set_n_threads functions in the backends + for (auto & backend : backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); + } } } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); - // reserve a worst case graph if needed - if (lctx.need_reserve) { - LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + throw std::runtime_error("failed to initialize self-attention cache"); + } - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; + { + const size_t memory_size_k = kv_self.size_k_bytes(); + const size_t memory_size_v = kv_self.size_v_bytes(); - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + // graph outputs buffer + { + // resized during inference when a batch uses more outputs + if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); + throw std::runtime_error("failed to reserve initial output buffer"); + } - ggml_cgraph * gf = lctx.build_graph(ubatch, true); + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name (buf_output.get()), + ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + } - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + // scheduler and compute buffers + { + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : backends) { + auto * buft = ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); } - lctx.need_reserve = false; - } + const size_t max_nodes = model.max_nodes(); - return true; - } + // buffer used to store the computation graph and the tensor meta data + // TODO: move to base 
class + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - void restore() { - kv_slot_restorer.restore(lctx.kv_self); - } + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + params.offload_kqv; - void update(const llama_ubatch & ubatch) { - auto & kv_self = lctx.kv_self; + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } - // update the kv ring buffer - { - kv_self.head += ubatch.n_tokens; + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); - // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); } - } - } - void finalize() { - const auto & cparams = lctx.cparams; + // initialize scheduler with the worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - auto & kv_self = lctx.kv_self; + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); - // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { - // - do not defrag small contexts (i.e. < 2048 tokens) - // - count the padding towards the number of used tokens - const float fragmentation = kv_self.n >= 2048 ? 
std::max(0.0f, 1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; + // reserve pp graph first so that buffers are only allocated once + ggml_backend_sched_reserve(sched.get(), gf_pp); + int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_pp = ggml_graph_n_nodes(gf_pp); - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); + // reserve with tg graph to get the number of splits and nodes + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); + ggml_backend_sched_reserve(sched.get(), gf_tg); + int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_tg = ggml_graph_n_nodes(gf_tg); - kv_self.defrag(); + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + gf_pp = build_graph(ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); } } } +} - int64_t n_outputs_all = 0; +llama_context_kv_self::~llama_context_kv_self() = default; - llama_context_kv_self & lctx; +uint32_t llama_context_kv_self::n_seq_max() const { + // TODO: add notion of n_seq_max to llama_kv_cache and use it here + return kv_self.size; +} - const llama_batch & batch; - - llama_kv_slot_restorer kv_slot_restorer; -}; +llama_kv_cache * llama_context_kv_self::get_kv_self() { + return &kv_self; +} -std::unique_ptr llama_context_kv_self::prepare_batch(const llama_batch & batch) { - return std::make_unique(*this, batch); +const llama_kv_cache * llama_context_kv_self::get_kv_self() const { + return &kv_self; } -int llama_context_kv_self::decode(llama_batch & inp_batch) { - is_encoding = false; +float * llama_context_kv_self::get_logits() { + // reorder logits for backward compatibility + reorder_outputs(); - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } + return logits; +} - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); +float * llama_context_kv_self::get_logits_ith(int32_t i) { + int32_t j = -1; - const llama_batch & batch = batch_allocr.batch; + try { + if (logits == nullptr) { + throw std::runtime_error("no logits"); + } - const auto & vocab = model.vocab; - const auto & hparams = model.hparams; + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } - const int32_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } - // TODO: try catch - auto bman = prepare_batch(batch); + return logits + j*model.vocab.n_tokens(); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} - const auto n_outputs_all = bman->n_outputs_all; +float * llama_context_kv_self::get_embeddings() { + // reorder embeddings for backward compatibility + reorder_outputs(); - // reserve output buffer - // TODO: move to batch manager? - if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; + return embd; +} - int64_t n_outputs_prev = 0; +float * llama_context_kv_self::get_embeddings_ith(int32_t i) { + int32_t j = -1; - while (!bman->is_done()) { - llama_ubatch ubatch = bman->next(); + try { + if (embd == nullptr) { + throw std::runtime_error("no embeddings"); + } - if (!bman->prepare(ubatch)) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - bman->restore(); - return -3; + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; } - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } - ggml_cgraph * gf = build_graph(ubatch, false); + return embd + j*model.hparams.n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); +float * llama_context_kv_self::get_embeddings_seq(llama_seq_id seq_id) { + auto it = embd_seq.find(seq_id); + if (it == embd_seq.end()) { + return nullptr; + } - ggml_backend_sched_alloc_graph(sched.get(), gf); + return it->second.data(); +} - set_inputs(ubatch); +ggml_context_ptr llama_context_kv_self::init() { + inp_tokens = nullptr; + 
inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; - // the output is always the last tensor in the graph - struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); - struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); + return llama_context::init(); +} - if (n_outputs == 0) { - // no output - t_logits = nullptr; - t_embd = nullptr; - } else if (cparams.embeddings) { - t_logits = nullptr; // do not extract logits for embedding case - t_embd = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - t_embd = ggml_graph_node(gf, i); - break; +struct llama_context_kv_self::batch_manager { + batch_manager(llama_context_kv_self & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { + const auto & model = lctx.model; + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + + const auto & kv_self = lctx.kv_self; + + const int64_t n_tokens_all = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int64_t i = 0; i < n_tokens_all; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); } } - GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); - } else { - t_embd = nullptr; // do not extract embeddings when not needed - GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); } - const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - bman->restore(); - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - } + GGML_ASSERT(n_tokens_all <= cparams.n_batch); - bman->update(ubatch); + GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} + if (lctx.t_compute_start_us == 0) { + lctx.t_compute_start_us = ggml_time_us(); + } + lctx.n_queued_tokens += n_tokens_all; - // extract logits - if (t_logits) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(logits != nullptr); + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - float * logits_out = logits + n_outputs_prev*n_vocab; + lctx.embd_seq.clear(); - if (n_outputs) { - GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + // 
count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs_all += batch.logits[i] != 0; } + } else if (lctx.logits_all || embd_pooled) { + n_outputs_all = n_tokens_all; + } else { + // keep last output only + n_outputs_all = 1; } - // extract embeddings - if (t_embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); - GGML_ASSERT(backend_embd != nullptr); + const bool logits_all = n_outputs_all == n_tokens_all; - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(embd != nullptr); - float * embd_out = embd + n_outputs_prev*n_embd; + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ logits_all); + } - if (n_outputs) { - GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings (cleared before processing each batch) - auto & embd_seq_out = embd_seq; + ~batch_manager() { + } - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // extract the rerank score - a single float per sequence - auto & embd_seq_out = embd_seq; + bool is_done() const { + return lctx.sbatch.n_tokens == 0; + } - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(1); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } + llama_ubatch next() { + llama_ubatch ubatch = llama_ubatch(); - n_outputs_prev += n_outputs; - } + const auto & cparams = lctx.cparams; + const auto & kv_self = lctx.kv_self; - // set output mappings - { - bool sorted_output = true; + const auto & n_ubatch = cparams.n_ubatch; - GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { - size_t out_id = sbatch.out_ids[i]; - output_ids[out_id] = i; - if (out_id != i) { - sorted_output = false; + if (kv_self.recurrent) { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = lctx.sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = lctx.sbatch.split_equal(n_ubatch); } + } else { + ubatch = lctx.sbatch.split_simple(n_ubatch); } - if (sorted_output) { - sbatch.out_ids.clear(); - } + return ubatch; } - // set to total number of outputs in the batch, for use in llama_get_logits_ith - n_outputs = n_outputs_all; - - // wait for the computation to finish (automatically done when obtaining the model 
output) - //synchronize(); + bool prepare(const llama_ubatch & ubatch) { + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + const auto & batch = lctx.sbatch.batch; - bman->finalize(); + const auto n_tokens_all = batch->n_tokens; - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + auto & kv_self = lctx.kv_self; - return 0; -} + // count the outputs in this u_batch + { + int32_t n_outputs_new = 0; -int llama_context_kv_self::encode(llama_batch & inp_batch) { - is_encoding = true; + if (n_outputs_all == n_tokens_all) { + n_outputs_new = ubatch.n_tokens; + } else { + GGML_ASSERT(ubatch.output); + for (uint32_t i = 0; i < ubatch.n_tokens; i++) { + n_outputs_new += (int32_t) (ubatch.output[i] != 0); + } + } - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } + // needs to happen before the graph is built + lctx.n_outputs = n_outputs_new; + } - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + lctx.kv_self_update(); - const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens = batch.n_tokens; + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } - const auto & hparams = model.hparams; + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { + return false; + } - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + kv_slot_restorer.save(slot_info); - if (batch.token) { - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; + if (!kv_self.recurrent) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); } } - } - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - if (t_compute_start_us == 0) { - t_compute_start_us = ggml_time_us(); - } + // reserve a worst case graph if needed + if (lctx.need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - n_queued_tokens += n_tokens; + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; - const int64_t n_embd = hparams.n_embd; + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - sbatch.from_batch(batch, n_embd, /* 
simple_split */ true, /* logits_all */ true); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + ggml_cgraph * gf = lctx.build_graph(ubatch, true); - // reserve output buffer - if (reserve_outputs(n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); - return -2; - }; + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } - for (uint32_t i = 0; i < n_tokens; ++i) { - output_ids[i] = i; + lctx.need_reserve = false; + } + + return true; } - inp_embd_enc = NULL; - n_outputs = n_tokens; + void restore() { + kv_slot_restorer.restore(lctx.kv_self); + } - //batch_manager->prepare(ubatch); + void update(const llama_ubatch & ubatch) { + auto & kv_self = lctx.kv_self; - // TODO: do reserve - GGML_ASSERT(need_reserve == false); + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + } - ggml_cgraph * gf = build_graph(ubatch, false); + void finalize() { + const auto & cparams = lctx.cparams; - ggml_backend_sched_alloc_graph(sched.get(), gf); + auto & kv_self = lctx.kv_self; - set_inputs(ubatch); + // decide if we need to defrag the kv cache + if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? 
std::max(0.0f, 1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; - // the output embeddings after the final encoder normalization - struct ggml_tensor * t_embd = nullptr; + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > cparams.defrag_thold) { + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - // there are two cases here - if (llama_model_has_decoder(&model)) { - // first case is an encoder-decoder T5 model where embeddings are passed to decoder - t_embd = ggml_graph_node(gf, -1); - GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); - } else { - // second case is an encoder-only T5 model - if (cparams.embeddings) { - // only output embeddings if required - t_embd = ggml_graph_node(gf, -1); - if (strcmp(t_embd->name, "result_embd_pooled") != 0) { - t_embd = ggml_graph_node(gf, -2); + kv_self.defrag(); } - GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); } } - const auto compute_status = compute_graph(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } + int64_t n_outputs_all = 0; - // extract embeddings - if (t_embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); - GGML_ASSERT(backend_embd != nullptr); + llama_context_kv_self & lctx; - if (llama_model_has_decoder(&model)) { - embd_enc.resize(n_tokens*n_embd); - float * embd_out = embd_enc.data(); + const llama_batch & batch; - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + llama_kv_slot_restorer kv_slot_restorer; +}; - // remember the sequence ids used during the encoding - needed for cross attention later - seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - seq_ids_enc[i].insert(seq_id); - } +std::unique_ptr llama_context_kv_self::prepare_batch(const llama_batch & batch) { + return std::make_unique(*this, batch); +} + +int llama_context_kv_self::decode(llama_batch & inp_batch) { + is_encoding = false; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd; + + // TODO: try catch + auto bman = prepare_batch(batch); + + const auto n_outputs_all = bman->n_outputs_all; + + // reserve output buffer + // TODO: move to batch manager? 
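+    // n_outputs_all was determined by the batch manager above:
+    //   - if batch.logits is provided (and we are not doing pooled embeddings), it is the number of non-zero logits flags
+    //   - if logits_all or pooled embeddings are enabled, it is the full token count
+    //   - otherwise only the last token produces an output (n_outputs_all == 1)
+    // e.g. a prompt batch with only the final logits flag set needs a single output row of n_vocab floats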
+ if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; + + while (!bman->is_done()) { + llama_ubatch ubatch = bman->next(); + + if (!bman->prepare(ubatch)) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + bman->restore(); + return -3; + } + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = build_graph(ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output is always the last tensor in the graph + struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); + struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); + + if (n_outputs == 0) { + // no output + t_logits = nullptr; + t_embd = nullptr; + } else if (cparams.embeddings) { + t_logits = nullptr; // do not extract logits for embedding case + t_embd = nullptr; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + t_embd = ggml_graph_node(gf, i); + break; + } } + GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); } else { - GGML_ASSERT(embd != nullptr); + t_embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); + } + + const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + bman->restore(); + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + bman->update(ubatch); + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + if (t_logits) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits + n_outputs_prev*n_vocab; + + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); switch (cparams.pooling_type) { case LLAMA_POOLING_TYPE_NONE: { // extract token embeddings GGML_ASSERT(embd != nullptr); - float * embd_out = embd; + float * embd_out = embd + n_outputs_prev*n_embd; - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); + } } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: 
case LLAMA_POOLING_TYPE_LAST: { - // extract sequence embeddings + // extract sequence embeddings (cleared before processing each batch) auto & embd_seq_out = embd_seq; - embd_seq_out.clear(); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - for (uint32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; } @@ -1356,93 +1498,306 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { } break; case LLAMA_POOLING_TYPE_RANK: { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { GGML_ABORT("unknown pooling type"); } } } - } - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(sched.get()); - return 0; -} + n_outputs_prev += n_outputs; + } -enum ggml_status llama_context_kv_self::compute_graph( - ggml_cgraph * graph, - bool batched) { - int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; - ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; + // set output mappings + { + bool sorted_output = true; - if (backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); - } + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } + for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { + size_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + if (sorted_output) { + sbatch.out_ids.clear(); + } } - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; - return status; -} + // wait for the computation to finish (automatically done when obtaining the model output) + //synchronize(); -llama_pos llama_context_kv_self::pos_max() const { - return kv_self.pos_max(); -} + bman->finalize(); -uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { - return kv_self.get_padding(cparams); -} + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); -void llama_context_kv_self::prepare_k_shift() { + return 0; } -void llama_context_kv_self::prepare_defrag() { -} +int llama_context_kv_self::encode(llama_batch & inp_batch) { + is_encoding = true; -// llama input + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } -void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); - // - // set input data - // + const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens = batch.n_tokens; - if (inp_K_shift) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + const auto & hparams = model.hparams; - int32_t * data = (int32_t *) inp_K_shift->data; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - for (uint32_t i = 0; i < kv_self.size; ++i) { - data[i] = kv_self.cells[i].delta; + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } } - - // the K-shift graph requires just this input - return; } - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (reserve_outputs(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (uint32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + inp_embd_enc = NULL; + n_outputs = n_tokens; + + //batch_manager->prepare(ubatch); + + // TODO: do reserve + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = build_graph(ubatch, false); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output embeddings after the final encoder normalization + struct ggml_tensor * t_embd = nullptr; + + // there are two cases here + if (llama_model_has_decoder(&model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + t_embd = ggml_graph_node(gf, -1); + GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + t_embd = ggml_graph_node(gf, -1); + if (strcmp(t_embd->name, "result_embd_pooled") != 0) { + t_embd = ggml_graph_node(gf, -2); + } + GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } + + const auto compute_status = compute_graph(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + if (llama_model_has_decoder(&model)) { + embd_enc.resize(n_tokens*n_embd); + float * embd_out = embd_enc.data(); + + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + 
GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + // remember the sequence ids used during the encoding - needed for cross attention later + seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + +enum ggml_status llama_context_kv_self::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + +llama_pos llama_context_kv_self::pos_max() const { + return kv_self.pos_max(); +} + +uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { + return kv_self.get_padding(cparams); +} + +void llama_context_kv_self::prepare_k_shift() { +} + +void llama_context_kv_self::prepare_defrag() { +} + +// llama input + +void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + + // + // set input data + // + + if (inp_K_shift) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; + } + + // the K-shift graph requires just this input + return; + } + + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); } if (ubatch.embd) { @@ -2810,646 +3165,323 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} - cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - - // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); - // split the above in two - // => {d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); - struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - - // conv - { - // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} - struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); - - // copy last (d_conv - 1) columns back into the state cache - struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - - ggml_build_forward_expand(graph, - ggml_cpy(ctx0, last_conv, - ggml_view_1d(ctx0, conv_states_all, - (d_conv - 1)*(d_inner)*(n_seqs), - kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - - // 1D convolution - // The equivalent is to make a self-overlapping view of conv_x - // over d_conv columns at each stride in the 3rd dimension, - // then element-wise multiply that with the conv1d weight, - // then sum the elements of each row, - // (the last two steps are a dot product over rows (also doable with mul_mat)) - // then permute away the ne[0] dimension, - // and then you're 
left with the resulting x tensor. - // For simultaneous sequences, all sequences need to have the same length. - x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); - - // bias - x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); - - x = ggml_silu(ctx0, x); - } - - // ssm - { - // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} - struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); - // split - struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); - struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - - // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_dt_b_c_rms) { - dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); - B = ggml_rms_norm(ctx0, B, norm_rms_eps); - C = ggml_rms_norm(ctx0, C, norm_rms_eps); - } - - // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} - dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); - dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); - - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. - // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} - struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); - - // store last states - ggml_build_forward_expand(graph, - ggml_cpy(ctx0, - ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), - ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); - - struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - - // TODO: skip computing output earlier for unused tokens - - // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} - y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); - - // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); - } - - // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); - //cb(cur, "mamba_out", il); - - return cur; -} - - -ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il, - bool worst_case) { - const auto & hparams = model.hparams; - - const auto token_shift_count = hparams.token_shift_count; - - const auto & n_tokens = ubatch.n_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - struct ggml_tensor * token_shift_all = kv_self.k_l[il]; - - struct ggml_tensor * token_shift = build_copy_mask_state( - ctx0, graph, token_shift_all, state_copy, state_mask, - n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); - - token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); - - return token_shift; -} - - -ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( - ggml_context * ctx0, - ggml_tensor * token_shift, - const llama_ubatch & 
ubatch, - int il, - bool worst_case) { - const auto & hparams = model.hparams; - - const auto token_shift_count = hparams.token_shift_count; - const auto n_embd = hparams.n_embd; - - const auto & n_tokens = ubatch.n_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - - return ggml_cpy( - ctx0, - ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - ); -} - - -ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * cur, - ggml_tensor * x_prev, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il, - bool worst_case) { - const auto & hparams = model.hparams; - - const auto n_tokens = ubatch.n_tokens; - const auto n_seqs = ubatch.n_seqs; - const auto n_embd = hparams.n_embd; - const auto head_size = hparams.wkv_head_size; - const auto n_head = n_embd / head_size; - const auto n_head_kv = hparams.n_head_kv(il); - - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - - const auto layer = &model.layers[il]; - - bool is_qrwkv = layer->time_mix_first == nullptr; - - struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); - - xxx = ggml_reshape_4d( - ctx0, - ggml_tanh( - ctx0, - ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) - ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - ); - - xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); - - xxx = ggml_mul_mat( - ctx0, - ggml_reshape_4d( - ctx0, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - ), - xxx - ); - - struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { - // fusing these weights makes some performance improvement - sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); - cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); - xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - } else { - // for backward compatibility - xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx0, 
ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); - } - - struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); - struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); - struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx0, r, layer->time_mix_receptance_b); - } - if (layer->time_mix_key_b) { - k = ggml_add(ctx0, k, layer->time_mix_key_b); - } - if (layer->time_mix_value_b) { - v = ggml_add(ctx0, v, layer->time_mix_value_b); - } - - struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); - if (is_qrwkv) { - g = ggml_sigmoid(ctx0, g); - } else { - g = ggml_silu(ctx0, g); - } - - if (n_head_kv != 0 && n_head_kv != n_head) { - GGML_ASSERT(n_head % n_head_kv == 0); - k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens); - v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens); - struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); - k = ggml_repeat(ctx0, k, tmp); - v = ggml_repeat(ctx0, v, tmp); - } - - k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens); - v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens); - r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens); - - struct ggml_tensor * w = ggml_mul_mat( - ctx0, - layer->time_mix_decay_w2, - ggml_tanh( - ctx0, - ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) - ) - ); - - w = ggml_add(ctx0, w, layer->time_mix_decay); - w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); - w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); - - if (is_qrwkv) { - // k = k * (1 - w) - k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); - } - - struct ggml_tensor * wkv_state = build_copy_mask_state( - ctx0, graph, kv_self.v_l[il], state_copy, state_mask, - n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); - - struct ggml_tensor * wkv_output; - if (is_qrwkv) { - wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); - } else { - wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); - } - cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); - wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - ggml_build_forward_expand( - graph, - ggml_cpy( - ctx0, - wkv_state, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); - - if (!is_qrwkv) { - // group norm with head_count groups - cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens); - cur = ggml_norm(ctx0, cur, 64e-5f); - - // Convert back to regular vectors. 
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); - } else { - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - } - - cur = ggml_mul(ctx0, cur, g); - cur = build_lora_mm(ctx0, layer->time_mix_output, cur); - - return cur; -} - -// -// state -// - -// TODO: this needs a big rework - -class llama_io_write_dummy : public llama_io_write_i { -public: - llama_io_write_dummy() = default; - - void write(const void * /* src */, size_t size) override { - size_written += size; - } - - void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { - size_written += size; - } + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - size_t n_bytes() override { - return size_written; - } + // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); + struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - size_t size_written = 0; -}; + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); -class llama_io_write_buffer : public llama_io_write_i { -public: - llama_io_write_buffer( - uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + // copy last (d_conv - 1) columns back into the state cache + struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - void write(const void * src, size_t size) override { - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - memcpy(ptr, src, size); - ptr += size; - size_written += size; - buf_size -= size; - } + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - ggml_backend_tensor_get(tensor, ptr, offset, size); - ptr += size; - size_written += size; - buf_size -= size; - } + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. 
+ x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); - size_t n_bytes() override { - return size_written; - } + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); - uint8_t * ptr; - size_t buf_size = 0; - size_t size_written = 0; -}; + x = ggml_silu(ctx0, x); + } -class llama_io_read_buffer : public llama_io_read_i { -public: - llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + // ssm + { + // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); + struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - const uint8_t * read(size_t size) override { - const uint8_t * base_ptr = ptr; - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); + // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms) { + dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); + B = ggml_rms_norm(ctx0, B, norm_rms_eps); + C = ggml_rms_norm(ctx0, C, norm_rms_eps); } - ptr += size; - size_read += size; - buf_size -= size; - return base_ptr; - } - void read_to(void * dst, size_t size) override { - memcpy(dst, read(size), size); - } + // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); - size_t n_bytes() override { - return size_read; - } + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. 
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); - const uint8_t * ptr; - size_t buf_size = 0; - size_t size_read = 0; -}; + // store last states + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); -class llama_io_write_file : public llama_io_write_i { -public: - llama_io_write_file(llama_file * f) : file(f) {} + struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; - } + // TODO: skip computing output earlier for unused tokens - void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { - temp_buffer.resize(size); - ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); - write(temp_buffer.data(), temp_buffer.size()); - } + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); - size_t n_bytes() override { - return size_written; + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); } - llama_file * file; - size_t size_written = 0; - std::vector temp_buffer; -}; + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + //cb(cur, "mamba_out", il); -class llama_io_read_file : public llama_io_read_i { -public: - llama_io_read_file(llama_file * f) : file(f) {} + return cur; +} - void read_to(void * dst, size_t size) override { - file->read_raw(dst, size); - size_read += size; - } - const uint8_t * read(size_t size) override { - temp_buffer.resize(size); - read_to(temp_buffer.data(), size); - return temp_buffer.data(); - } +ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; - size_t n_bytes() override { - return size_read; - } + const auto token_shift_count = hparams.token_shift_count; - llama_file * file; - size_t size_read = 0; - std::vector temp_buffer; -}; + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; -size_t llama_context_kv_self::state_get_size() { - llama_io_write_dummy io; - try { - return state_get_data(io); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); - return 0; - } -} + struct ggml_tensor * token_shift_all = kv_self.k_l[il]; -size_t llama_context_kv_self::state_get_data(uint8_t * dst, size_t size) { - llama_io_write_buffer io(dst, size); - try { - return state_get_data(io); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); - return 0; - } -} + struct ggml_tensor * token_shift = build_copy_mask_state( + ctx0, graph, token_shift_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); -size_t llama_context_kv_self::state_set_data(const uint8_t * src, size_t 
size) { - llama_io_read_buffer io(src, size); - try { - return state_set_data(io); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); - return 0; - } -} + token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); -size_t llama_context_kv_self::state_seq_get_size(llama_seq_id seq_id) { - llama_io_write_dummy io; - try { - return state_seq_get_data(io, seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); - return 0; - } + return token_shift; } -size_t llama_context_kv_self::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { - llama_io_write_buffer io(dst, size); - try { - return state_seq_get_data(io, seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); - return 0; - } -} -size_t llama_context_kv_self::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { - llama_io_read_buffer io(src, size); - try { - return state_seq_set_data(io, seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); - return 0; - } -} +ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; -bool llama_context_kv_self::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); + const auto token_shift_count = hparams.token_shift_count; + const auto n_embd = hparams.n_embd; - // sanity checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; - if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } - } + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + return ggml_cpy( + ctx0, + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), + ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + ); +} - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); - if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); - return false; - } +ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } + const auto n_tokens = ubatch.n_tokens; + const auto n_seqs = ubatch.n_seqs; + const auto n_embd = hparams.n_embd; + const auto head_size = hparams.wkv_head_size; + const auto n_head = n_embd / head_size; + const auto n_head_kv = hparams.n_head_kv(il); - // restore the context state - { - const size_t n_state_size_cur = file.size() - file.tell(); + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - llama_io_read_file io( &file); - const size_t n_read = state_set_data(io); + const auto layer = &model.layers[il]; - if (n_read != n_state_size_cur) { - LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); - return false; - } - } + bool is_qrwkv = layer->time_mix_first == nullptr; - return true; -} + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); -bool llama_context_kv_self::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); + xxx = ggml_reshape_4d( + ctx0, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + ), + layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + ); - file.write_u32(LLAMA_SESSION_MAGIC); - file.write_u32(LLAMA_SESSION_VERSION); + xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); + xxx = ggml_mul_mat( + ctx0, + ggml_reshape_4d( + ctx0, + layer->time_mix_w2, + layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + ), + xxx + ); - // save the context state using stream saving - llama_io_write_file io(&file); - state_get_data(io); + struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + if (layer->time_mix_lerp_fused) { + // fusing these weights makes some performance improvement + sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + } else { + // for backward compatibility + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = 
ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - return true; -} + xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + } -size_t llama_context_kv_self::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); + struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); + struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); + struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); + if (layer->time_mix_receptance_b) { + r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + } + if (layer->time_mix_key_b) { + k = ggml_add(ctx0, k, layer->time_mix_key_b); + } + if (layer->time_mix_value_b) { + v = ggml_add(ctx0, v, layer->time_mix_value_b); + } - // version checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); + struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); + if (is_qrwkv) { + g = ggml_sigmoid(ctx0, g); + } else { + g = ggml_silu(ctx0, g); + } - if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { - LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); - return 0; - } + if (n_head_kv != 0 && n_head_kv != n_head) { + GGML_ASSERT(n_head % n_head_kv == 0); + k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens); + v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens); + struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); + k = ggml_repeat(ctx0, k, tmp); + v = ggml_repeat(ctx0, v, tmp); } - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); + k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens); + v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens); + r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens); - if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); - return 0; - } + struct ggml_tensor * w = ggml_mul_mat( + ctx0, + layer->time_mix_decay_w2, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + ) + ); - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } + w = ggml_add(ctx0, w, layer->time_mix_decay); + w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); - // restore the context state - { - const size_t state_size = file.size() - file.tell(); - llama_io_read_file io(&file); - const size_t nread = state_seq_set_data(io, seq_id); - if (!nread) { - LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); - return 0; - } - GGML_ASSERT(nread <= state_size); - GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); + if (is_qrwkv) { + // k = k * (1 - w) + k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); } - return file.tell(); -} + struct ggml_tensor * wkv_state = build_copy_mask_state( + ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); -size_t llama_context_kv_self::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); + struct ggml_tensor * wkv_output; + if (is_qrwkv) { + wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); + } else { + wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); + } + cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - file.write_u32(LLAMA_STATE_SEQ_MAGIC); - file.write_u32(LLAMA_STATE_SEQ_VERSION); + ggml_build_forward_expand( + graph, + ggml_cpy( + ctx0, + wkv_state, + ggml_view_1d( + ctx0, + kv_self.v_l[il], + hparams.n_embd_v_s() * n_seqs, + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + ) + ) + ); - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); + if (!is_qrwkv) { + // group norm with head_count groups + cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens); + cur = ggml_norm(ctx0, cur, 64e-5f); - // save the context state using stream saving - llama_io_write_file io(&file); - state_seq_get_data(io, seq_id); + // Convert back to regular vectors. 
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + } else { + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + } - const size_t res = file.tell(); - GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes()); + cur = ggml_mul(ctx0, cur, g); + cur = build_lora_mm(ctx0, layer->time_mix_output, cur); - return res; + return cur; } -size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { - synchronize(); +// state save/load - // write model info - { - const std::string arch_str = llm_arch_name(model.arch); - io.write_string(arch_str); - // TODO: add more model-specific info which should prevent loading the session file if not identical - } +size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { + llama_context::state_get_data(io); // write output ids { @@ -3492,7 +3524,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { } } - // write mbeddings + // write embeddings { const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); @@ -3509,19 +3541,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { } size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { - synchronize(); - - // read model info - { - const std::string cur_arch_str = llm_arch_name(model.arch); - - std::string arch_str; - io.read_string(arch_str); - if (cur_arch_str != arch_str) { - throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); - } - // TODO: add more info which needs to be identical but which is not verified otherwise - } + llama_context::state_set_data(io); // read output ids { @@ -3584,7 +3604,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { } size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - synchronize(); + llama_context::state_seq_get_data(io, seq_id); kv_self.state_write(io, model.hparams, seq_id); @@ -3592,7 +3612,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se } size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - synchronize(); + llama_context::state_seq_set_data(io, seq_id); kv_self.state_read(io, model.hparams, seq_id); @@ -3937,15 +3957,21 @@ size_t llama_state_get_size(struct llama_context * ctx) { } size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { + ctx->synchronize(); + return ctx->state_get_data(dst, size); } // Sets the state reading from the specified source address size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { + ctx->synchronize(); + return ctx->state_set_data(src, size); } bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + ctx->synchronize(); + try { return ctx->state_load_file(path_session, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { @@ -3955,6 +3981,8 @@ bool llama_state_load_file(struct llama_context * ctx, const char * path_session } bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + ctx->synchronize(); + try { return ctx->state_save_file(path_session, tokens, n_token_count); } catch (const 
std::exception & err) { @@ -3968,14 +3996,20 @@ size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) } size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { + ctx->synchronize(); + return ctx->state_seq_get_data(seq_id, dst, size); } size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) { + ctx->synchronize(); + return ctx->state_seq_set_data(seq_id, src, size); } size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { + ctx->synchronize(); + try { return ctx->state_seq_save_file(seq_id, filepath, tokens, n_token_count); } catch (const std::exception & err) { @@ -3985,6 +4019,8 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa } size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + ctx->synchronize(); + try { return ctx->state_seq_load_file(dest_seq_id, filepath, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { diff --git a/src/llama-context.h b/src/llama-context.h index 204793d75a5b1..235fcfee4fb91 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -144,37 +144,37 @@ struct llama_context : public llama_graph_i { // state save/load - virtual size_t state_get_size() = 0; - virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; - virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; + virtual size_t state_get_size(); + virtual size_t state_get_data( uint8_t * dst, size_t size); + virtual size_t state_set_data(const uint8_t * src, size_t size); - virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; - virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; - virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; + virtual size_t state_seq_get_size(llama_seq_id seq_id); + virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size); + virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size); virtual bool state_load_file( const char * filepath, llama_token * tokens_out, size_t n_token_capacity, - size_t * n_token_count_out) = 0; + size_t * n_token_count_out); virtual bool state_save_file( const char * filepath, const llama_token * tokens, - size_t n_token_count) = 0; + size_t n_token_count); virtual size_t state_seq_load_file( llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, - size_t * n_token_count_out) = 0; + size_t * n_token_count_out); virtual size_t state_seq_save_file( llama_seq_id seq_id, const char * filepath, const llama_token * tokens, - size_t n_token_count) = 0; + size_t n_token_count); // perf @@ -183,6 +183,14 @@ struct llama_context : public llama_graph_i { protected: + // state save/load + + virtual size_t state_get_data(llama_io_write_i & io); + virtual size_t state_set_data(llama_io_read_i & io); + + virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); + virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); + // members const llama_model & model; @@ -471,46 +479,12 @@ class llama_context_kv_self : public llama_context { int il, bool worst_case) override; - // state 
save/load - - virtual size_t state_get_size() override; - virtual size_t state_get_data( uint8_t * dst, size_t size) override; - virtual size_t state_set_data(const uint8_t * src, size_t size) override; - - virtual size_t state_seq_get_size(llama_seq_id seq_id) override; - virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; - virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; - - virtual bool state_load_file( - const char * filepath, - llama_token * tokens_out, - size_t n_token_capacity, - size_t * n_token_count_out) override; - - virtual bool state_save_file( - const char * filepath, - const llama_token * tokens, - size_t n_token_count) override; - - virtual size_t state_seq_load_file( - llama_seq_id seq_id, - const char * filepath, - llama_token * tokens_out, - size_t n_token_capacity, - size_t * n_token_count_out) override; - - virtual size_t state_seq_save_file( - llama_seq_id seq_id, - const char * filepath, - const llama_token * tokens, - size_t n_token_count) override; - -private: - size_t state_get_data(llama_io_write_i & io); - size_t state_set_data(llama_io_read_i & io); +protected: + virtual size_t state_get_data(llama_io_write_i & io) override; + virtual size_t state_set_data(llama_io_read_i & io) override; - size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); - size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); + virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; }; // For internal test use From e08f38df69b0cf47b461c16d2541e78ddd3b9cb7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 12:50:53 +0200 Subject: [PATCH 41/84] context : minor cleanup ggml-ci --- src/llama-context.cpp | 57 +++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bde6659531024..e234e3683bc39 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -10,30 +10,6 @@ #include #include -static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { - // TODO move to hparams if a T5 variant appears that uses a different value - const int64_t max_distance = 128; - - if (bidirectional) { - n_buckets >>= 1; - } - - const int64_t max_exact = n_buckets >> 1; - - int32_t relative_position = x - y; - int32_t relative_bucket = 0; - if (bidirectional) { - relative_bucket += (relative_position > 0) * n_buckets; - relative_position = abs(relative_position); - } else { - relative_position = -std::min(relative_position, 0); - } - int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); - relative_position_if_large = std::min(relative_position_if_large, n_buckets - 1); - relative_bucket += (relative_position < max_exact ? 
relative_position : relative_position_if_large); - return relative_bucket; -} - // // llama_context // @@ -346,6 +322,7 @@ class llama_io_write_dummy : public llama_io_write_i { return size_written; } +private: size_t size_written = 0; }; @@ -378,6 +355,7 @@ class llama_io_write_buffer : public llama_io_write_i { return size_written; } +private: uint8_t * ptr; size_t buf_size = 0; size_t size_written = 0; @@ -406,6 +384,7 @@ class llama_io_read_buffer : public llama_io_read_i { return size_read; } +private: const uint8_t * ptr; size_t buf_size = 0; size_t size_read = 0; @@ -430,6 +409,7 @@ class llama_io_write_file : public llama_io_write_i { return size_written; } +private: llama_file * file; size_t size_written = 0; std::vector temp_buffer; @@ -454,6 +434,7 @@ class llama_io_read_file : public llama_io_read_i { return size_read; } +private: llama_file * file; size_t size_read = 0; std::vector temp_buffer; @@ -2132,6 +2113,30 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing + static const auto relative_position_bucket = [](llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { + // TODO move to hparams if a T5 variant appears that uses a different value + const int64_t max_distance = 128; + + if (bidirectional) { + n_buckets >>= 1; + } + + const int64_t max_exact = n_buckets >> 1; + + int32_t relative_position = x - y; + int32_t relative_bucket = 0; + if (bidirectional) { + relative_bucket += (relative_position > 0) * n_buckets; + relative_position = abs(relative_position); + } else { + relative_position = -std::min(relative_position, 0); + } + int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); + relative_position_if_large = std::min(relative_position_if_large, n_buckets - 1); + relative_bucket += (relative_position < max_exact ? 
relative_position : relative_position_if_large); + return relative_bucket; + }; + int32_t * data = (int32_t *) inp_pos_bucket->data; if (!is_encoding) { @@ -2139,7 +2144,7 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); + data[h*(n_kv*n_tokens) + j*n_kv + i] = relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } @@ -2147,7 +2152,7 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } From 107d1e2c32612552676db06c028a2cf4d7f2aa03 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 15:42:14 +0200 Subject: [PATCH 42/84] context : move output functionality to base class ggml-ci --- src/llama-context.cpp | 760 +++++++++++++++++++++--------------------- src/llama-context.h | 97 +++--- 2 files changed, 419 insertions(+), 438 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e234e3683bc39..33c256feddc8a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -58,6 +58,105 @@ enum llama_pooling_type llama_context::pooling_type() const { return cparams.pooling_type; } +float * llama_context::get_logits() { + // reorder logits for backward compatibility + output_reorder(); + + return logits; +} + +float * llama_context::get_logits_ith(int32_t i) { + int32_t j = -1; + + try { + if (logits == nullptr) { + throw std::runtime_error("no logits"); + } + + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } + + return logits + j*model.vocab.n_tokens(); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + +float * llama_context::get_embeddings() { + // reorder embeddings for backward compatibility + output_reorder(); + + return embd; +} + +float * llama_context::get_embeddings_ith(int32_t i) { + int32_t j = -1; + + try { + if (embd == nullptr) { + throw std::runtime_error("no embeddings"); + } + + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } 
+ if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } + + return embd + j*model.hparams.n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + +float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { + auto it = embd_seq.find(seq_id); + if (it == embd_seq.end()) { + return nullptr; + } + + return it->second.data(); +} + int64_t llama_context::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; } @@ -631,6 +730,58 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // TODO: add more model-specific info which should prevent loading the session file if not identical } + // write output ids + { + output_reorder(); + + const uint32_t n_outputs = this->n_outputs; + const auto & output_ids = this->output_ids; + + std::vector w_output_pos; + + GGML_ASSERT(n_outputs <= output_size); + + w_output_pos.resize(n_outputs); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch(); ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + GGML_ASSERT((uint32_t) pos < n_outputs); + w_output_pos[pos] = i; + } + } + + io.write(&n_outputs, sizeof(n_outputs)); + + if (n_outputs) { + io.write(w_output_pos.data(), n_outputs * sizeof(int32_t)); + } + } + + // write logits + { + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); + + io.write(&logits_size, sizeof(logits_size)); + + if (logits_size) { + io.write(logits, logits_size * sizeof(float)); + } + } + + // write embeddings + { + const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); + + io.write(&embd_size, sizeof(embd_size)); + + if (embd_size) { + io.write(embd, embd_size * sizeof(float)); + } + } + return io.n_bytes(); } @@ -647,6 +798,61 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { // TODO: add more info which needs to be identical but which is not verified otherwise } + // read output ids + { + std::vector output_pos; + + uint32_t n_outputs; + io.read_to(&n_outputs, sizeof(n_outputs)); + + if (n_outputs > output_reserve(n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } + + if (n_outputs) { + output_pos.resize(n_outputs); + io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); + } + this->output_ids[id] = i; + } + + this->n_outputs = n_outputs; + } + } + + // read logits + { + uint64_t logits_size; + io.read_to(&logits_size, sizeof(logits_size)); + + if (this->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } + + if (logits_size) { + io.read_to(this->logits, logits_size * sizeof(float)); + } + } + + // read embeddings + { + uint64_t embd_size; + io.read_to(&embd_size, sizeof(embd_size)); + + if (this->embd_size < embd_size) { + throw std::runtime_error("embeddings buffer too small"); + } + + if (embd_size) { + io.read_to(this->embd, embd_size * sizeof(float)); + } + } + return io.n_bytes(); } @@ -852,7 +1058,7 @@ 
llama_context_kv_self::llama_context_kv_self( // graph outputs buffer { // resized during inference when a batch uses more outputs - if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { + if (output_reserve(params.n_seq_max) < params.n_seq_max) { LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); throw std::runtime_error("failed to reserve initial output buffer"); } @@ -988,105 +1194,6 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -float * llama_context_kv_self::get_logits() { - // reorder logits for backward compatibility - reorder_outputs(); - - return logits; -} - -float * llama_context_kv_self::get_logits_ith(int32_t i) { - int32_t j = -1; - - try { - if (logits == nullptr) { - throw std::runtime_error("no logits"); - } - - if (i < 0) { - j = n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); - } - } else if ((size_t) i >= output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); - } else { - j = output_ids[i]; - } - - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } - - return logits + j*model.vocab.n_tokens(); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif - } -} - -float * llama_context_kv_self::get_embeddings() { - // reorder embeddings for backward compatibility - reorder_outputs(); - - return embd; -} - -float * llama_context_kv_self::get_embeddings_ith(int32_t i) { - int32_t j = -1; - - try { - if (embd == nullptr) { - throw std::runtime_error("no embeddings"); - } - - if (i < 0) { - j = n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); - } - } else if ((size_t) i >= output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); - } else { - j = output_ids[i]; - } - - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } - - return embd + j*model.hparams.n_embd; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif - } -} - -float * llama_context_kv_self::get_embeddings_seq(llama_seq_id seq_id) { - auto it = embd_seq.find(seq_id); - if (it == embd_seq.end()) { - return nullptr; - } - - return it->second.data(); -} - ggml_context_ptr llama_context_kv_self::init() { inp_tokens = nullptr; inp_embd = nullptr; @@ -1357,7 +1464,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // reserve output buffer // TODO: move to batch manager? 
- if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { + if (output_reserve(bman->n_outputs_all) < (size_t) n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; }; @@ -1579,7 +1686,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { const llama_ubatch ubatch = sbatch.split_simple(n_tokens); // reserve output buffer - if (reserve_outputs(n_tokens) < n_tokens) { + if (output_reserve(n_tokens) < n_tokens) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); return -2; }; @@ -1712,33 +1819,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { return 0; } -enum ggml_status llama_context_kv_self::compute_graph( - ggml_cgraph * graph, - bool batched) { - int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; - ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; - - if (backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); - } - - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } - - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); - - return status; -} - llama_pos llama_context_kv_self::pos_max() const { return kv_self.pos_max(); } @@ -1747,12 +1827,6 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c return kv_self.get_padding(cparams); } -void llama_context_kv_self::prepare_k_shift() { -} - -void llama_context_kv_self::prepare_defrag() { -} - // llama input void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { @@ -2192,117 +2266,10 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { for (int j = 0; j < n_output_enc; ++j) { data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY; - } - } - } - } -} - -void llama_context_kv_self::reorder_outputs() { - std::vector & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = model.vocab.n_tokens(); - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? 
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); - } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); - } - } - } - std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - -size_t llama_context_kv_self::reserve_outputs(size_t n_outputs) { - const auto & hparams = model.hparams; - const auto & vocab = model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - logits_size = has_logits ? n_vocab*n_outputs_max : 0; - embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (output_ids.empty()) { - // init, never resized afterwards - output_ids.resize(n_batch); - } - - const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!buf_output || prev_size < new_size) { - if (buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - buf_output = nullptr; - logits = nullptr; - embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; + } + } } } - - float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); - - logits = has_logits ? output_base : nullptr; - embd = has_embd ? 
output_base + logits_size : nullptr; - - output_size = n_outputs_max; - - // set all ids as invalid (negative) - std::fill(output_ids.begin(), output_ids.end(), -1); - - ggml_backend_buffer_clear(buf_output.get(), 0); - - n_outputs = 0; - - return n_outputs_max; } void llama_context_kv_self::kv_self_update() { @@ -2315,8 +2282,6 @@ void llama_context_kv_self::kv_self_update() { // apply K-shift if needed if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - prepare_k_shift(); - ggml_backend_sched_reset(sched.get()); auto ctx = init(); @@ -2346,8 +2311,6 @@ void llama_context_kv_self::kv_self_update() { // defragment the KV cache if needed if (kv.do_defrag) { - prepare_defrag(); - ggml_backend_sched_reset(sched.get()); auto ctx = init(); @@ -3333,20 +3296,20 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - const auto layer = &model.layers[il]; + const auto & layer = model.layers[il]; - bool is_qrwkv = layer->time_mix_first == nullptr; + bool is_qrwkv = layer.time_mix_first == nullptr; struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur); xxx = ggml_reshape_4d( ctx0, ggml_tanh( ctx0, - ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + ggml_mul_mat(ctx0, layer.time_mix_w1, xxx) ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens ); xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); @@ -3355,18 +3318,18 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( ctx0, ggml_reshape_4d( ctx0, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + layer.time_mix_w2, + layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5 ), xxx ); struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { + if (layer.time_mix_lerp_fused) { // fusing these weights makes some performance improvement sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur); xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); @@ -3380,27 +3343,27 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx0, ggml_mul(ctx0, 
ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur); } - struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); - struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); - struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + struct ggml_tensor * r = build_lora_mm(ctx0, layer.time_mix_receptance, xr); + struct ggml_tensor * k = build_lora_mm(ctx0, layer.time_mix_key, xk); + struct ggml_tensor * v = build_lora_mm(ctx0, layer.time_mix_value, xv); + if (layer.time_mix_receptance_b) { + r = ggml_add(ctx0, r, layer.time_mix_receptance_b); } - if (layer->time_mix_key_b) { - k = ggml_add(ctx0, k, layer->time_mix_key_b); + if (layer.time_mix_key_b) { + k = ggml_add(ctx0, k, layer.time_mix_key_b); } - if (layer->time_mix_value_b) { - v = ggml_add(ctx0, v, layer->time_mix_value_b); + if (layer.time_mix_value_b) { + v = ggml_add(ctx0, v, layer.time_mix_value_b); } - struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); + struct ggml_tensor * g = build_lora_mm(ctx0, layer.time_mix_gate, xg); if (is_qrwkv) { g = ggml_sigmoid(ctx0, g); } else { @@ -3422,14 +3385,14 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( struct ggml_tensor * w = ggml_mul_mat( ctx0, - layer->time_mix_decay_w2, + layer.time_mix_decay_w2, ggml_tanh( ctx0, - ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw) ) ); - w = ggml_add(ctx0, w, layer->time_mix_decay); + w = ggml_add(ctx0, w, layer.time_mix_decay); w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); @@ -3446,7 +3409,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( if (is_qrwkv) { wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); } else { - wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); + wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state); } cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); @@ -3472,13 +3435,13 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( // Convert back to regular vectors. 
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b); } else { cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); } cur = ggml_mul(ctx0, cur, g); - cur = build_lora_mm(ctx0, layer->time_mix_output, cur); + cur = build_lora_mm(ctx0, layer.time_mix_output, cur); return cur; } @@ -3488,58 +3451,6 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); - // write output ids - { - reorder_outputs(); - - const uint32_t n_outputs = this->n_outputs; - const auto & output_ids = this->output_ids; - - std::vector w_output_pos; - - GGML_ASSERT(n_outputs <= output_size); - - w_output_pos.resize(n_outputs); - - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch(); ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - GGML_ASSERT((uint32_t) pos < n_outputs); - w_output_pos[pos] = i; - } - } - - io.write(&n_outputs, sizeof(n_outputs)); - - if (n_outputs) { - io.write(w_output_pos.data(), n_outputs * sizeof(int32_t)); - } - } - - // write logits - { - const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); - - io.write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - io.write(logits, logits_size * sizeof(float)); - } - } - - // write embeddings - { - const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); - - io.write(&embd_size, sizeof(embd_size)); - - if (embd_size) { - io.write(embd, embd_size * sizeof(float)); - } - } - kv_self.state_write(io, model.hparams); return io.n_bytes(); @@ -3548,61 +3459,6 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); - // read output ids - { - std::vector output_pos; - - uint32_t n_outputs; - io.read_to(&n_outputs, sizeof(n_outputs)); - - if (n_outputs > reserve_outputs(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } - - if (n_outputs) { - output_pos.resize(n_outputs); - io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= n_batch()) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); - } - this->output_ids[id] = i; - } - - this->n_outputs = n_outputs; - } - } - - // read logits - { - uint64_t logits_size; - io.read_to(&logits_size, sizeof(logits_size)); - - if (this->logits_size < logits_size) { - throw std::runtime_error("logits buffer too small"); - } - - if (logits_size) { - io.read_to(this->logits, logits_size * sizeof(float)); - } - } - - // read embeddings - { - uint64_t embd_size; - io.read_to(&embd_size, sizeof(embd_size)); - - if (this->embd_size < embd_size) { - throw std::runtime_error("embeddings buffer too small"); - } - - if (embd_size) { - io.read_to(this->embd, embd_size * sizeof(float)); - } - } - kv_self.state_read(io, model.hparams); return io.n_bytes(); @@ -3768,6 +3624,140 @@ int32_t llama_apply_adapter_cvec( return res ? 
0 : -1; } +enum ggml_status llama_context::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + +size_t llama_context::output_reserve(size_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + + logits = has_logits ? output_base : nullptr; + embd = has_embd ? 
output_base + logits_size : nullptr; + + output_size = n_outputs_max; + + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); + + ggml_backend_buffer_clear(buf_output.get(), 0); + + n_outputs = 0; + + return n_outputs_max; +} + +void llama_context::output_reorder() { + std::vector & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} + // // kv cache view // diff --git a/src/llama-context.h b/src/llama-context.h index 235fcfee4fb91..16d138b4cbd35 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -43,12 +43,12 @@ struct llama_context : public llama_graph_i { virtual enum llama_pooling_type pooling_type() const; - virtual float * get_logits() = 0; - virtual float * get_logits_ith(int32_t i) = 0; + virtual float * get_logits(); + virtual float * get_logits_ith(int32_t i); - virtual float * get_embeddings() = 0; - virtual float * get_embeddings_ith(int32_t i) = 0; - virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; + virtual float * get_embeddings(); + virtual float * get_embeddings_ith(int32_t i); + virtual float * get_embeddings_seq(llama_seq_id seq_id); virtual int64_t n_pos_per_token() const; // vision @@ -85,6 +85,19 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); + // returns the result of ggml_backend_sched_graph_compute_async execution + virtual enum ggml_status compute_graph( + ggml_cgraph * graph, + bool batched); + + // Make sure enough space is available for outputs. + // Returns max number of outputs for which space was reserved. 
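+    // Example usage (sketch; n_outputs_all is a hypothetical per-batch output count):
+    //
+    //     if (output_reserve(n_outputs_all) < n_outputs_all) {
+    //         LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, (int) n_outputs_all);
+    //         return -2;
+    //     }
+    //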
+ virtual size_t output_reserve(size_t n_outputs); + + // make the outputs have the same order they had in the user-provided batch + // TODO: maybe remove this + virtual void output_reorder(); + // graph build API (generic) virtual void build_cb( @@ -198,6 +211,7 @@ struct llama_context : public llama_graph_i { llama_cparams cparams; llama_adapter_cvec cvec; llama_loras loras; + llama_sbatch sbatch; ggml_threadpool_t threadpool = nullptr; ggml_threadpool_t threadpool_batch = nullptr; @@ -215,6 +229,31 @@ struct llama_context : public llama_graph_i { // memory buffers used to evaluate the model std::vector buf_compute_meta; + // host buffer for the model output (logits and embeddings) + ggml_backend_buffer_ptr buf_output; + + // TODO: remove + bool logits_all = false; + + // decode output (2-dimensional array: [n_outputs][n_vocab]) + size_t logits_size = 0; // capacity (of floats) for logits + float * logits = nullptr; + + // embeddings output (2-dimensional array: [n_outputs][n_embd]) + // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE + size_t embd_size = 0; // capacity (of floats) for embeddings + float * embd = nullptr; + + // sequence embeddings output (map of [n_embd] vectors) + // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE + std::map> embd_seq; + + size_t output_size = 0; // capacity (of tokens positions) for the output buffers + int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch + + std::vector output_ids; // map batch token positions to ids of the logits and embd buffers + + bool need_reserve = false; bool has_evaluated_once = false; mutable int64_t t_start_us = 0; @@ -247,69 +286,21 @@ class llama_context_kv_self : public llama_context { virtual void kv_self_update() override; - virtual float * get_logits() override; - virtual float * get_logits_ith(int32_t i) override; - - virtual float * get_embeddings() override; - virtual float * get_embeddings_ith(int32_t i) override; - virtual float * get_embeddings_seq(llama_seq_id seq_id) override; - virtual ggml_context_ptr init() override; virtual int decode(llama_batch & inp_batch) override; virtual int encode(llama_batch & inp_batch) override; - llama_sbatch sbatch; - - // host buffer for the model output (logits and embeddings) - ggml_backend_buffer_ptr buf_output; - - // decode output (2-dimensional array: [n_outputs][n_vocab]) - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; - - std::vector output_ids; // map batch token positions to ids of the logits and embd buffers - size_t output_size = 0; // capacity (of tokens positions) for the output buffers - int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch - - bool logits_all = false; - bool need_reserve = false; - - // embeddings output (2-dimensional array: [n_outputs][n_embd]) - // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE - size_t embd_size = 0; // capacity (of floats) for embeddings - float * embd = nullptr; - - // sequence embeddings output (map of [n_embd] vectors) - // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE - std::map> embd_seq; - virtual std::unique_ptr prepare_batch(const llama_batch & batch); - // returns the result of ggml_backend_sched_graph_compute_async execution - enum ggml_status compute_graph( - ggml_cgraph * graph, - bool batched); - // max token position across all sequences in the current context llama_pos pos_max() const; // certain implementations could 
require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - void prepare_k_shift(); - void prepare_defrag(); - void set_inputs(const llama_ubatch & ubatch); - // make the outputs have the same order they had in the user-provided batch - // TODO: maybe remove this - void reorder_outputs(); - - // Make sure enough space is available for outputs. - // Returns max number of outputs for which space was reserved. - size_t reserve_outputs(size_t n_outputs); - // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] From ed3cb55abefed68e4123b269da7d840fc9531010 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 15:53:15 +0200 Subject: [PATCH 43/84] context : abstract input ggml-ci --- src/llama-context.cpp | 638 +++++++++++++++++++++--------------------- src/llama-context.h | 23 +- 2 files changed, 334 insertions(+), 327 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 33c256feddc8a..485430095f2f9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -269,6 +269,309 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } +enum ggml_status llama_context::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + +void llama_context::input_set(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + } + + if (ubatch.embd) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); + } + + if (ubatch.pos && inp_pos) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); + } + + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); + + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch.output) { + int32_t n_outputs = 0; + 
for (int i = 0; i < n_tokens; ++i) { + if (ubatch.output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); + + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + + sum[seq_id] += ubatch.n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } + } + + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } + + GGML_ASSERT( + // (!a || b) is a logical implication (a -> b) + // 
!hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); +} + +size_t llama_context::output_reserve(size_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + + logits = has_logits ? output_base : nullptr; + embd = has_embd ? output_base + logits_size : nullptr; + + output_size = n_outputs_max; + + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); + + ggml_backend_buffer_clear(buf_output.get(), 0); + + n_outputs = 0; + + return n_outputs_max; +} + +void llama_context::output_reorder() { + std::vector & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? 
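+        // illustrative example (hypothetical values): out_ids = { 7, 2, 5 } is sorted to { 2, 5, 7 }
+        // while the matching logits/embd rows are swapped along with it, and afterwards
+        // output_ids[2] = 0, output_ids[5] = 1, output_ids[7] = 2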
+ // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} + + void llama_context::build_cb( ggml_tensor * cur, const char * name, @@ -1489,7 +1792,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - set_inputs(ubatch); + input_set(ubatch); // the output is always the last tensor in the graph struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); @@ -1710,7 +2013,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - set_inputs(ubatch); + input_set(ubatch); // the output embeddings after the final encoder normalization struct ggml_tensor * t_embd = nullptr; @@ -1829,84 +2132,24 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c // llama input -void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { +void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - - // - // set input data - // - - if (inp_K_shift) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); - - int32_t * data = (int32_t *) inp_K_shift->data; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - data[i] = kv_self.cells[i].delta; - } - - // the K-shift graph requires just this input - return; - } - - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); - } - - if (ubatch.embd) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); - } - - if (ubatch.pos && inp_pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - - if (!inp_out_ids) { - LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); - int32_t * data = (int32_t *) inp_out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } + + if (inp_K_shift) { + 
assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; } + + // the K-shift graph requires just this input + return; } - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + // call base functionality + llama_context::input_set(ubatch); if (inp_KQ_mask || inp_KQ_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. @@ -2029,111 +2272,6 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { } } - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - - float * data = (float *) inp_mean->data; - memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than 
n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } - if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; @@ -2293,7 +2431,7 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_alloc_graph(sched.get(), gf); - set_inputs({}); + input_set({}); compute_graph(gf, false); @@ -2323,7 +2461,7 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_alloc_graph(sched.get(), gf); // no input - //set_inputs({}); + //input_set({}); compute_graph(gf, false); @@ -3624,140 +3762,6 @@ int32_t llama_apply_adapter_cvec( return res ? 0 : -1; } -enum ggml_status llama_context::compute_graph( - ggml_cgraph * graph, - bool batched) { - int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; - ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; - - if (backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); - } - - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } - - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); - - return status; -} - -size_t llama_context::output_reserve(size_t n_outputs) { - const auto & hparams = model.hparams; - const auto & vocab = model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - logits_size = has_logits ? n_vocab*n_outputs_max : 0; - embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (output_ids.empty()) { - // init, never resized afterwards - output_ids.resize(n_batch); - } - - const size_t prev_size = buf_output ? 
ggml_backend_buffer_get_size(buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!buf_output || prev_size < new_size) { - if (buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - buf_output = nullptr; - logits = nullptr; - embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); - - logits = has_logits ? output_base : nullptr; - embd = has_embd ? output_base + logits_size : nullptr; - - output_size = n_outputs_max; - - // set all ids as invalid (negative) - std::fill(output_ids.begin(), output_ids.end(), -1); - - ggml_backend_buffer_clear(buf_output.get(), 0); - - n_outputs = 0; - - return n_outputs_max; -} - -void llama_context::output_reorder() { - std::vector & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = model.vocab.n_tokens(); - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? - // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); - } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); - } - } - } - std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - // // kv cache view // diff --git a/src/llama-context.h b/src/llama-context.h index 16d138b4cbd35..f8040138222c2 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -90,6 +90,8 @@ struct llama_context : public llama_graph_i { ggml_cgraph * graph, bool batched); + virtual void input_set(const llama_ubatch & ubatch); + // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. 
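     // Returns 0 if the output buffer could not be (re)allocated.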
virtual size_t output_reserve(size_t n_outputs); @@ -204,6 +206,15 @@ struct llama_context : public llama_graph_i { virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); + // input tensors + + struct ggml_tensor * inp_tokens; // I32 [n_batch] + struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + struct ggml_tensor * inp_pos; // I32 [n_batch] + struct ggml_tensor * inp_out_ids; // I32 [n_outputs] + struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + struct ggml_tensor * inp_cls; // I32 [n_batch] + // members const llama_model & model; @@ -288,6 +299,8 @@ class llama_context_kv_self : public llama_context { virtual ggml_context_ptr init() override; + virtual void input_set(const llama_ubatch & ubatch) override; + virtual int decode(llama_batch & inp_batch) override; virtual int encode(llama_batch & inp_batch) override; @@ -299,16 +312,6 @@ class llama_context_kv_self : public llama_context { // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - void set_inputs(const llama_ubatch & ubatch); - - // input tensors - struct ggml_tensor * inp_tokens; // I32 [n_batch] - struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - struct ggml_tensor * inp_pos; // I32 [n_batch] - struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - struct ggml_tensor * inp_cls; // I32 [n_batch] - // === unified KV cache === llama_kv_cache kv_self; From 131743ff4f17bfe65c5bf6b79187ad9fd7fcdb55 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 17:13:42 +0200 Subject: [PATCH 44/84] context : abstract constructor and init ggml-ci --- src/llama-context.cpp | 657 ++++++++++++++++++++++-------------------- src/llama-context.h | 25 +- src/llama.cpp | 2 +- 3 files changed, 359 insertions(+), 325 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 485430095f2f9..31085f644ba0f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -14,14 +14,290 @@ // llama_context // -llama_context::llama_context(const llama_model & model) : +llama_context::llama_context( + const llama_model & model, + const llama_context_params & params) : model (model), t_start_us(model.t_start_us), t_load_us (model.t_load_us) { + const auto & hparams = model.hparams; + + cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; + cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : + hparams.n_ctx_train; + + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + } + + cparams.yarn_attn_factor *= hparams.rope_attn_factor; + + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + } else { + cparams.pooling_type = hparams.pooling_type; + } + } + + if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { + cparams.causal_attn = hparams.causal_attn; + } else { + cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + } + + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + logits_all = params.logits_all; + + if (!hparams.vocab_only) { + // GPU backends + for (auto * dev : model.devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + + // add ACCEL backends (such as BLAS) + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == 
GGML_BACKEND_DEVICE_TYPE_ACCEL) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + } + + // add CPU backend + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + throw std::runtime_error("failed to initialize CPU backend"); + } + backends.emplace_back(backend_cpu); + + // create a list of the set_n_threads functions in the backends + for (auto & backend : backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); + } + } + } + + llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); + + // graph outputs buffer + { + // resized during inference when a batch uses more outputs + if (output_reserve(params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); + throw std::runtime_error("failed to reserve initial output buffer"); + } + + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name (buf_output.get()), + ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + } + } } llama_context::~llama_context() = default; +void llama_context::init() { + const auto & hparams = model.hparams; + + if (hparams.vocab_only) { + LLAMA_LOG_WARN("%s: model is vocab-only -- no computation will be performed\n", __func__); + return; + } + + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : backends) { + auto * buft = ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); + } + + const size_t max_nodes = model.max_nodes(); + + // buffer used to store the computation graph and the tensor meta data + // TODO: move to base class + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + cparams.offload_kqv; + + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = 
ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } + + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + } + + // initialize scheduler with the worst-case graph + { + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + + // reserve pp graph first so that buffers are only allocated once + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_pp = ggml_graph_n_nodes(gf_pp); + + // reserve with tg graph to get the number of splits and nodes + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); + if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_tg = ggml_graph_n_nodes(gf_tg); + + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + gf_pp = build_graph(ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + } + } +} + const llama_model & llama_context::get_model() const { return model; } @@ -161,46 +437,6 @@ int64_t llama_context::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 
4 : 1; } -ggml_context_ptr llama_context::init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - return ggml_context_ptr { ggml_init(params) }; -} - -void llama_context::synchronize() { - ggml_backend_sched_synchronize(sched.get()); - - // FIXME: if multiple single tokens are evaluated without a synchronization, - // the stats will be added to the prompt evaluation stats - // this should only happen when using batch size 1 to evaluate a batch - - // add the evaluation to the stats - if (n_queued_tokens == 1) { - if (!cparams.no_perf) { - t_eval_us += ggml_time_us() - t_compute_start_us; - } - n_eval++; - } else if (n_queued_tokens > 1) { - if (!cparams.no_perf) { - t_p_eval_us += ggml_time_us() - t_compute_start_us; - } - n_p_eval += n_queued_tokens; - } - - // get a more accurate load time, upon first eval - if (n_queued_tokens > 0 && !has_evaluated_once) { - t_load_us = ggml_time_us() - t_start_us; - has_evaluated_once = true; - } - - n_queued_tokens = 0; - t_compute_start_us = 0; -} - void llama_context::attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) { @@ -269,7 +505,54 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -enum ggml_status llama_context::compute_graph( +void llama_context::synchronize() { + ggml_backend_sched_synchronize(sched.get()); + + // FIXME: if multiple single tokens are evaluated without a synchronization, + // the stats will be added to the prompt evaluation stats + // this should only happen when using batch size 1 to evaluate a batch + + // add the evaluation to the stats + if (n_queued_tokens == 1) { + if (!cparams.no_perf) { + t_eval_us += ggml_time_us() - t_compute_start_us; + } + n_eval++; + } else if (n_queued_tokens > 1) { + if (!cparams.no_perf) { + t_p_eval_us += ggml_time_us() - t_compute_start_us; + } + n_p_eval += n_queued_tokens; + } + + // get a more accurate load time, upon first eval + if (n_queued_tokens > 0 && !has_evaluated_once) { + t_load_us = ggml_time_us() - t_start_us; + has_evaluated_once = true; + } + + n_queued_tokens = 0; + t_compute_start_us = 0; +} + +ggml_context_ptr llama_context::graph_init() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + return ggml_context_ptr { ggml_init(params) }; +} + +enum ggml_status llama_context::graph_compute( ggml_cgraph * graph, bool batched) { int n_threads = batched ? 
cparams.n_threads_batch : cparams.n_threads; @@ -608,7 +891,7 @@ void llama_context::build_cb( } ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(*this, cparams, ubatch, init(), worst_case); + return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case); } llama_perf_context_data llama_context::perf_get_data() const { @@ -1183,100 +1466,15 @@ void llama_context::perf_reset() { llama_context_kv_self::llama_context_kv_self( const llama_model & model, - const llama_context_params & params) : llama_context(model) { + const llama_context_params & params) : + llama_context(model, params) { const auto & hparams = model.hparams; - cparams.n_seq_max = std::max(1u, params.n_seq_max); - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; - cparams.embeddings = params.embeddings; - cparams.offload_kqv = params.offload_kqv; - cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); - - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; - } - - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : - hparams.n_ctx_train; - - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; - - auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { - rope_scaling_type = hparams.rope_scaling_type_train; - } - - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { - cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none - } - - if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 
1.0f : 0.0f; - } - - cparams.yarn_attn_factor *= hparams.rope_attn_factor; - - if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; - } else { - cparams.pooling_type = hparams.pooling_type; - } - } + LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { - cparams.causal_attn = hparams.causal_attn; - } else { - cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; - } + cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - - if (n_ctx_per_seq < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - logits_all = params.logits_all; + LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); // build worst-case graph for encoder if a model contains encoder is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() @@ -1298,51 +1496,6 @@ llama_context_kv_self::llama_context_kv_self( GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); if (!hparams.vocab_only) { - // GPU backends - for (auto * dev : model.devices) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - throw std::runtime_error("failed to initialize backend"); - } - backends.emplace_back(backend); - } - - // add ACCEL backends (such as BLAS) - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - throw std::runtime_error("failed to initialize backend"); - } - backends.emplace_back(backend); - } - } - - // add CPU backend - backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - throw std::runtime_error("failed to initialize CPU backend"); - } - backends.emplace_back(backend_cpu); - - // create a list of the set_n_threads functions in the backends - for (auto & backend : backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); - ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); - } - } - } - - llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); - if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); throw std::runtime_error("failed to initialize self-attention cache"); @@ -1357,128 +1510,6 @@ llama_context_kv_self::llama_context_kv_self( ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - - // graph outputs buffer - { - // resized during inference when a batch uses more outputs - if (output_reserve(params.n_seq_max) < params.n_seq_max) { - LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); - throw std::runtime_error("failed to reserve initial output buffer"); - } - - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name (buf_output.get()), - ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); - } - - // scheduler and compute buffers - { - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model.devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; - } - } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } - - const size_t max_nodes = model.max_nodes(); - - // buffer used to store the computation graph and the tensor meta data - // TODO: move to base class - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model.n_devices() > 1 && - model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; - - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { - // ignore CPU backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; - } - } - } - - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); - - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled 
(n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); - } - - // initialize scheduler with the worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); - - // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(sched.get(), gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); - - // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); - ggml_backend_sched_reserve(sched.get(), gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); - - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = build_graph(ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } - - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } - - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); - } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); - } - } } } @@ -1497,15 +1528,7 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -ggml_context_ptr llama_context_kv_self::init() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; +ggml_context_ptr llama_context_kv_self::graph_init() { inp_KQ_mask = nullptr; inp_KQ_mask_cnv = nullptr; inp_KQ_mask_swa = nullptr; @@ -1514,8 +1537,10 @@ ggml_context_ptr llama_context_kv_self::init() { inp_K_shift = nullptr; inp_s_copy = nullptr; inp_s_mask = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; - return llama_context::init(); + return llama_context::graph_init(); } struct llama_context_kv_self::batch_manager { @@ -1817,7 +1842,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); } - const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); + const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != 
GGML_STATUS_SUCCESS) { bman->restore(); switch (compute_status) { @@ -2035,7 +2060,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { } } - const auto compute_status = compute_graph(gf, n_tokens > 1); + const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: break; @@ -2422,7 +2447,7 @@ void llama_context_kv_self::kv_self_update() { if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched.get()); - auto ctx = init(); + auto ctx = graph_init(); auto ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -2433,7 +2458,7 @@ void llama_context_kv_self::kv_self_update() { input_set({}); - compute_graph(gf, false); + graph_compute(gf, false); need_reserve = true; } @@ -2451,7 +2476,7 @@ void llama_context_kv_self::kv_self_update() { if (kv.do_defrag) { ggml_backend_sched_reset(sched.get()); - auto ctx = init(); + auto ctx = graph_init(); auto ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -2463,7 +2488,7 @@ void llama_context_kv_self::kv_self_update() { // no input //input_set({}); - compute_graph(gf, false); + graph_compute(gf, false); kv.do_defrag = false; diff --git a/src/llama-context.h b/src/llama-context.h index f8040138222c2..e70c99f331cd3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -21,9 +21,16 @@ class llama_io_write_i; using llama_loras = std::unordered_map; struct llama_context : public llama_graph_i { - llama_context(const llama_model & model); + llama_context( + const llama_model & model, + const llama_context_params & params); + virtual ~llama_context(); + // init scheduler and compute buffers + // call once after the context is constructed + virtual void init(); + const llama_model & get_model() const; const llama_cparams & get_cparams() const; @@ -52,10 +59,6 @@ struct llama_context : public llama_graph_i { virtual int64_t n_pos_per_token() const; // vision - virtual ggml_context_ptr init(); - - virtual void synchronize(); - virtual void attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch); @@ -85,8 +88,14 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); + //// + + virtual void synchronize(); + + virtual ggml_context_ptr graph_init(); + // returns the result of ggml_backend_sched_graph_compute_async execution - virtual enum ggml_status compute_graph( + virtual enum ggml_status graph_compute( ggml_cgraph * graph, bool batched); @@ -297,7 +306,7 @@ class llama_context_kv_self : public llama_context { virtual void kv_self_update() override; - virtual ggml_context_ptr init() override; + virtual ggml_context_ptr graph_init() override; virtual void input_set(const llama_ubatch & ubatch) override; @@ -312,7 +321,7 @@ class llama_context_kv_self : public llama_context { // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - // === unified KV cache === + // === KV cache === llama_kv_cache kv_self; diff --git a/src/llama.cpp b/src/llama.cpp index d20a2a6d50f60..a677902f0ba7c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -328,6 +328,7 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct ctx = new llama_context_kv_self(*model, params); + ctx->init(); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); return 
nullptr;
@@ -410,7 +411,6 @@ const char * llama_print_system_info(void) {
     static std::string s;
     s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
-
     for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
         auto * reg = ggml_backend_reg_get(i);
         auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");

From d5e8e1a2ba315599d09e6d5fbb37a2b98f841c07 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 14 Feb 2025 16:10:55 +0200
Subject: [PATCH 45/84] context : remove batch_manager

ggml-ci
---
 src/llama-batch.h     |   4 +-
 src/llama-context.cpp | 334 ++++++++++++++++++------------------
 src/llama-context.h   |  61 ++++----
 src/llama-kv-cache.h  |   6 +-
 4 files changed, 178 insertions(+), 227 deletions(-)

diff --git a/src/llama-batch.h b/src/llama-batch.h
index 773c3808b770f..f1df40d27086e 100644
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -42,9 +42,9 @@ struct llama_sbatch {
     bool logits_all; // TODO: remove once lctx.logits_all is removed too

     // sorted indices into the batch
-    std::vector ids;
+    std::vector ids;
     // batch indices of the output
-    std::vector out_ids;
+    std::vector out_ids;
     std::vector seq;

     const llama_batch * batch = nullptr;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 31085f644ba0f..f3fa4c592c86b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -161,7 +161,7 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-        if (output_reserve(params.n_seq_max) < params.n_seq_max) {
+        if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
             LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
             throw std::runtime_error("failed to reserve initial output buffer");
         }
@@ -747,11 +747,11 @@ void llama_context::input_set(const llama_ubatch & ubatch) {
     );
 }

-size_t llama_context::output_reserve(size_t n_outputs) {
+int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto & hparams = model.hparams;
     const auto & vocab   = model.vocab;

-    const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
+    const int64_t n_outputs_max = std::max(n_outputs, cparams.n_seq_max);

     const auto n_batch = cparams.n_batch;
     const auto n_vocab = vocab.n_tokens();
@@ -817,7 +817,7 @@ size_t llama_context::output_reserve(size_t n_outputs) {
 }

 void llama_context::output_reorder() {
-    std::vector & out_ids = sbatch.out_ids;
+    auto & out_ids = sbatch.out_ids;
     if (!out_ids.empty()) {
         const uint32_t n_vocab = model.vocab.n_tokens();
         const uint32_t n_embd  = model.hparams.n_embd;
@@ -1320,8 +1320,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
     {
         output_reorder();

-        const uint32_t n_outputs = this->n_outputs;
-        const auto & output_ids  = this->output_ids;
+        const auto n_outputs    = this->n_outputs;
+        const auto & output_ids = this->output_ids;

         std::vector w_output_pos;

@@ -1334,7 +1334,7 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
             // map an output id to a position in the batch
             int32_t pos = output_ids[i];
             if (pos >= 0) {
-                GGML_ASSERT((uint32_t) pos < n_outputs);
+                GGML_ASSERT(pos < n_outputs);
                 w_output_pos[pos] = i;
             }
         }
@@ -1386,15 +1386,15 @@ size_t llama_context::state_set_data(llama_io_read_i & io) {
     // read output ids
     {
-        std::vector output_pos;
-
-        uint32_t n_outputs;
+        auto n_outputs = this->n_outputs;

         io.read_to(&n_outputs, sizeof(n_outputs));

         if (n_outputs > output_reserve(n_outputs)) {
             throw
std::runtime_error("could not reserve outputs"); } + std::vector output_pos; + if (n_outputs) { output_pos.resize(n_outputs); io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); @@ -1543,73 +1543,112 @@ ggml_context_ptr llama_context_kv_self::graph_init() { return llama_context::graph_init(); } -struct llama_context_kv_self::batch_manager { - batch_manager(llama_context_kv_self & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { - const auto & model = lctx.model; - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; +int llama_context_kv_self::decode(llama_batch & inp_batch) { + is_encoding = false; - const auto & kv_self = lctx.kv_self; + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + const llama_batch & batch = batch_allocr.batch; - if (batch.token) { - for (int64_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - throw std::runtime_error("invalid token"); - } - } + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + + const int64_t n_tokens_all = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + // TODO: remove this stuff + class batch_guard { + public: + batch_guard(llama_kv_cache & kv_self) : kv_slot_restorer(kv_self) { } - GGML_ASSERT(n_tokens_all <= cparams.n_batch); + ~batch_guard() { + if (!is_done) { + kv_slot_restorer.restore(); + } + } - GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); + void done() { + is_done = true; + } - if (lctx.t_compute_start_us == 0) { - lctx.t_compute_start_us = ggml_time_us(); + void save(const llama_kv_cache_slot_info & slot_info) { + kv_slot_restorer.save(slot_info); } - lctx.n_queued_tokens += n_tokens_all; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + private: + bool is_done = false; - lctx.embd_seq.clear(); + llama_kv_slot_restorer kv_slot_restorer; + }; + + batch_guard bg(kv_self); - // count outputs - if (batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens_all; ++i) { - n_outputs_all += batch.logits[i] != 0; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int64_t i = 0; i < n_tokens_all; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); } - } else if (lctx.logits_all || embd_pooled) { - n_outputs_all = n_tokens_all; - } else { - // keep last output only - n_outputs_all = 1; } + } - const bool logits_all = n_outputs_all == n_tokens_all; + GGML_ASSERT(n_tokens_all <= 
cparams.n_batch); - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ logits_all); - } + GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); - ~batch_manager() { + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); } + n_queued_tokens += n_tokens_all; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - bool is_done() const { - return lctx.sbatch.n_tokens == 0; + embd_seq.clear(); + + int64_t n_outputs_all = 0; + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + } else if (logits_all || embd_pooled) { + n_outputs_all = n_tokens_all; + } else { + // keep last output only + n_outputs_all = 1; } - llama_ubatch next() { - llama_ubatch ubatch = llama_ubatch(); + const bool logits_all = n_outputs_all == n_tokens_all; + + sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ logits_all); + + // reserve output buffer + // TODO: move to batch manager? + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; + while (sbatch.n_tokens > 0) { + llama_ubatch ubatch = llama_ubatch(); const auto & n_ubatch = cparams.n_ubatch; @@ -1618,28 +1657,16 @@ struct llama_context_kv_self::batch_manager { if (kv_self.recurrent) { if (embd_pooled) { // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); + ubatch = sbatch.split_seq(n_ubatch); } else { // recurrent model architectures are easier to implement // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); + ubatch = sbatch.split_equal(n_ubatch); } } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); + ubatch = sbatch.split_simple(n_ubatch); } - return ubatch; - } - - bool prepare(const llama_ubatch & ubatch) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & batch = lctx.sbatch.batch; - - const auto n_tokens_all = batch->n_tokens; - - auto & kv_self = lctx.kv_self; - // count the outputs in this u_batch { int32_t n_outputs_new = 0; @@ -1654,12 +1681,12 @@ struct llama_context_kv_self::batch_manager { } // needs to happen before the graph is built - lctx.n_outputs = n_outputs_new; + n_outputs = n_outputs_new; } // non-causal masks do not use the KV cache if (hparams.causal_attn) { - lctx.kv_self_update(); + kv_self_update(); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -1669,10 +1696,11 @@ struct llama_context_kv_self::batch_manager { const auto slot_info = kv_self.find_slot(ubatch); if (!slot_info) { - return false; + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + return -3; } - kv_slot_restorer.save(slot_info); + bg.save(slot_info); if (!kv_self.recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized @@ -1687,12 +1715,9 @@ struct llama_context_kv_self::batch_manager { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", 
kv_self.n, kv_self.used, kv_self.head); // reserve a worst case graph if needed - if (lctx.need_reserve) { + if (need_reserve) { LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - // build worst-case graph uint32_t n_seqs = 1; // TODO: worst-case number of sequences uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -1700,112 +1725,15 @@ struct llama_context_kv_self::batch_manager { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = lctx.build_graph(ubatch, true); + ggml_cgraph * gf = build_graph(ubatch, true); // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + ggml_backend_sched_reset(sched.get()); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } - lctx.need_reserve = false; - } - - return true; - } - - void restore() { - kv_slot_restorer.restore(lctx.kv_self); - } - - void update(const llama_ubatch & ubatch) { - auto & kv_self = lctx.kv_self; - - // update the kv ring buffer - { - kv_self.head += ubatch.n_tokens; - - // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; - } - } - } - - void finalize() { - const auto & cparams = lctx.cparams; - - auto & kv_self = lctx.kv_self; - - // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { - // - do not defrag small contexts (i.e. < 2048 tokens) - // - count the padding towards the number of used tokens - const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - - kv_self.defrag(); - } - } - } - - int64_t n_outputs_all = 0; - - llama_context_kv_self & lctx; - - const llama_batch & batch; - - llama_kv_slot_restorer kv_slot_restorer; -}; - -std::unique_ptr llama_context_kv_self::prepare_batch(const llama_batch & batch) { - return std::make_unique(*this, batch); -} - -int llama_context_kv_self::decode(llama_batch & inp_batch) { - is_encoding = false; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - - const auto & vocab = model.vocab; - const auto & hparams = model.hparams; - - const int32_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; - - // TODO: try catch - auto bman = prepare_batch(batch); - - const auto n_outputs_all = bman->n_outputs_all; - - // reserve output buffer - // TODO: move to batch manager? 
- if (output_reserve(bman->n_outputs_all) < (size_t) n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; - - int64_t n_outputs_prev = 0; - - while (!bman->is_done()) { - llama_ubatch ubatch = bman->next(); - - if (!bman->prepare(ubatch)) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - bman->restore(); - return -3; + need_reserve = false; } ggml_backend_sched_reset(sched.get()); @@ -1844,7 +1772,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { - bman->restore(); switch (compute_status) { case GGML_STATUS_ABORTED: return 2; @@ -1856,7 +1783,15 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { } } - bman->update(ubatch); + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { @@ -1936,14 +1871,17 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { n_outputs_prev += n_outputs; } + // finalize the batch processing + bg.done(); + // set output mappings { bool sorted_output = true; GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); - for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { - size_t out_id = sbatch.out_ids[i]; + for (int64_t i = 0; i < n_outputs_all; ++i) { + int64_t out_id = sbatch.out_ids[i]; output_ids[out_id] = i; if (out_id != i) { sorted_output = false; @@ -1961,7 +1899,19 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // wait for the computation to finish (automatically done when obtaining the model output) //synchronize(); - bman->finalize(); + // decide if we need to defrag the kv cache + if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > cparams.defrag_thold) { + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); + + kv_self.defrag(); + } + } // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. @@ -1983,14 +1933,14 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens = batch.n_tokens; + const int32_t n_tokens = batch.n_tokens; const auto & hparams = model.hparams; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT if (batch.token) { - for (uint32_t i = 0; i < n_tokens; ++i) { + for (int32_t i = 0; i < n_tokens; ++i) { if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; @@ -1999,7 +1949,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { } // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); if (t_compute_start_us == 0) { t_compute_start_us = ggml_time_us(); @@ -2019,7 +1969,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { return -2; }; - for (uint32_t i = 0; i < n_tokens; ++i) { + for (int32_t i = 0; i < n_tokens; ++i) { output_ids[i] = i; } @@ -2087,7 +2037,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { // remember the sequence ids used during the encoding - needed for cross attention later seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { + for (int32_t i = 0; i < n_tokens; i++) { for (int s = 0; s < ubatch.n_seq_id[i]; s++) { llama_seq_id seq_id = ubatch.seq_id[i][s]; seq_ids_enc[i].insert(seq_id); @@ -2116,7 +2066,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - for (uint32_t i = 0; i < n_tokens; i++) { + for (int32_t i = 0; i < n_tokens; i++) { const llama_seq_id seq_id = ubatch.seq_id[i][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; @@ -2448,7 +2398,7 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_reset(sched.get()); auto ctx = graph_init(); - auto ctx0 = ctx.get(); + auto * ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -2477,7 +2427,7 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_reset(sched.get()); auto ctx = graph_init(); - auto ctx0 = ctx.get(); + auto * ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); diff --git a/src/llama-context.h b/src/llama-context.h index e70c99f331cd3..f2ebf4f13321f 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -92,6 +92,7 @@ struct llama_context : public llama_graph_i { virtual void synchronize(); + // zero-out inputs and create ggml_context virtual ggml_context_ptr graph_init(); // returns the result of ggml_backend_sched_graph_compute_async execution @@ -103,13 +104,40 @@ struct llama_context : public llama_graph_i { // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. 
-    virtual size_t output_reserve(size_t n_outputs);
+    virtual int32_t output_reserve(int32_t n_outputs);

     // make the outputs have the same order they had in the user-provided batch
     // TODO: maybe remove this
     virtual void output_reorder();

+    // decode a batch of tokens by evaluating the transformer
+    // in case of unsuccessful decoding (error or warning),
+    // the kv_cache state will be returned to its original state
+    // (for non-recurrent models) or cleaned (for recurrent models)
+    //
+    //   - lctx:      llama context
+    //   - inp_batch: batch to evaluate
+    //
+    // return 0 on success
+    // return positive int on warning
+    // return negative int on error
+    //
+    virtual int decode(llama_batch & inp_batch) = 0;
+
+    // encode a batch of tokens by evaluating the encoder part of the transformer
+    //
+    //   - lctx:  llama context
+    //   - batch: batch to evaluate
+    //
+    // return 0 on success
+    // return positive int on warning
+    // return negative int on error
+    //
+    virtual int encode(llama_batch & inp_batch) = 0;
+
+    //
     // graph build API (generic)
+    //

     virtual void build_cb(
              ggml_tensor * cur,
@@ -141,31 +169,6 @@ struct llama_context : public llama_graph_i {

     virtual ggml_tensor * build_rope_factors(int il);

-    // decode a batch of tokens by evaluating the transformer
-    // in case of unsuccessful decoding (error or warning),
-    // the kv_cache state will be returned to its original state
-    // (for non-recurrent models) or cleaned (for recurrent models)
-    //
-    //   - lctx:      llama context
-    //   - inp_batch: batch to evaluate
-    //
-    // return 0 on success
-    // return positive int on warning
-    // return negative int on error
-    //
-    virtual int decode(llama_batch & inp_batch) = 0;
-
-    // encode a batch of tokens by evaluating the encoder part of the transformer
-    //
-    //   - lctx:  llama context
-    //   - batch: batch to evaluate
-    //
-    // return 0 on success
-    // return positive int on warning
-    // return negative int on error
-    //
-    virtual int encode(llama_batch & inp_batch) = 0;
-
     // state save/load

     virtual size_t state_get_size();
@@ -268,7 +271,7 @@ struct llama_context : public llama_graph_i {
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
     std::map> embd_seq;

-    size_t  output_size = 0; // capacity (of tokens positions) for the output buffers
+    int32_t output_size = 0; // capacity (of tokens positions) for the output buffers
     int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch

     std::vector output_ids; // map batch token positions to ids of the logits and embd buffers
@@ -291,8 +294,6 @@ struct llama_context : public llama_graph_i {
 // transformer with a self-attention KV cache
 class llama_context_kv_self : public llama_context {
 public:
-    struct batch_manager;
-
     llama_context_kv_self(
             const llama_model & model,
             const llama_context_params & params);
@@ -313,8 +314,6 @@ class llama_context_kv_self : public llama_context {
     virtual int decode(llama_batch & inp_batch) override;
     virtual int encode(llama_batch & inp_batch) override;

-    virtual std::unique_ptr prepare_batch(const llama_batch & batch);
-
     // max token position across all sequences in the current context
     llama_pos pos_max() const;

diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 6ea4972979661..3bb07ca9da431 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -150,7 +150,9 @@ struct llama_kv_slot_restorer {

     bool do_restore = false;

-    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+    llama_kv_cache & cache;
+
+    explicit
llama_kv_slot_restorer(llama_kv_cache & cache) : cache(cache) { old_state.head = cache.head; old_state.n = cache.n; } @@ -167,7 +169,7 @@ struct llama_kv_slot_restorer { // must be explicitly called to restore the kv_cache state // and rollback changes from all llama_kv_cache_find_slot calls - void restore(struct llama_kv_cache & cache) { + void restore() { if (do_restore) { cache.head = old_state.head; cache.n = old_state.n; From 828064564cb661c763d7fb8ac9f0095666b143c3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 14 Feb 2025 16:48:21 +0200 Subject: [PATCH 46/84] context : move common inputs to base class ggml-ci --- src/llama-context.cpp | 178 +++++++++++++++++++++--------------------- src/llama-context.h | 44 +++++------ 2 files changed, 111 insertions(+), 111 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f3fa4c592c86b..01dd19e559481 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -987,6 +987,95 @@ ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } +ggml_tensor * llama_context::build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) { + const auto & hparams = model.hparams; + + const int64_t n_embd = hparams.n_embd; + + struct ggml_tensor * inpL; + + if (ubatch.token) { + inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp_tokens, "inp_tokens", -1); + ggml_set_input(inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = inp_embd; + ggml_set_input(inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + //cb(inpL, "inp_embd", -1); + + return inpL; +} + +ggml_tensor * llama_context::build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) { + inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp_pos); + + return inp_pos; +} + +ggml_tensor * llama_context::build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + const int32_t n_out_ids = worst_case ? 
n_tokens : n_outputs; + + inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); + ggml_set_input(inp_out_ids); + + return inp_out_ids; +} + +ggml_tensor * llama_context::build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) { + inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp_mean); + + return inp_mean; +} + +ggml_tensor * llama_context::build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) { + inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp_cls); + + return inp_cls; +} + // // state // @@ -2682,95 +2771,6 @@ ggml_tensor * llama_context_kv_self::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -ggml_tensor * llama_context_kv_self::build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) { - const auto & hparams = model.hparams; - - const int64_t n_embd = hparams.n_embd; - - struct ggml_tensor * inpL; - - if (ubatch.token) { - inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp_tokens, "inp_tokens", -1); - ggml_set_input(inp_tokens); - - inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens); - - // apply lora for embedding tokens if needed - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( - ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, inp_tokens) - ), scale); - - inpL = ggml_add(ctx0, inpL, inpL_delta); - } - } else { - inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp_embd; - ggml_set_input(inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); - } - - //cb(inpL, "inp_embd", -1); - - return inpL; -} - -ggml_tensor * llama_context_kv_self::build_inp_pos( - ggml_context * ctx0, - int32_t n_tokens) { - inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp_pos); - - return inp_pos; -} - -ggml_tensor * llama_context_kv_self::build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { - const int32_t n_out_ids = worst_case ? 
n_tokens : n_outputs; - - inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); - ggml_set_input(inp_out_ids); - - return inp_out_ids; -} - -ggml_tensor * llama_context_kv_self::build_inp_mean( - ggml_context * ctx0, - int32_t n_tokens) { - inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp_mean); - - return inp_mean; -} - -ggml_tensor * llama_context_kv_self::build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) { - inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp_cls); - - return inp_cls; -} - void llama_context_kv_self::build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) { diff --git a/src/llama-context.h b/src/llama-context.h index f2ebf4f13321f..e3483228d3d1a 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -169,6 +169,28 @@ struct llama_context : public llama_graph_i { virtual ggml_tensor * build_rope_factors(int il); + virtual ggml_tensor * build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch); + + virtual ggml_tensor * build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens); + + virtual ggml_tensor * build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case); + + virtual ggml_tensor * build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens); + + virtual ggml_tensor * build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens); + // state save/load virtual size_t state_get_size(); @@ -330,28 +352,6 @@ class llama_context_kv_self : public llama_context { struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] struct ggml_tensor * inp_K_shift; // I32 [kv_size] - virtual ggml_tensor * build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) override; - - virtual ggml_tensor * build_inp_pos( - ggml_context * ctx0, - int32_t n_tokens) override; - - virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) override; - - virtual ggml_tensor * build_inp_mean( - ggml_context * ctx0, - int32_t n_tokens) override; - - virtual ggml_tensor * build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) override; - virtual void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, From 1d801d27b9b9a79bc06255548792df9ae4f6c7fe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 14 Feb 2025 17:22:55 +0200 Subject: [PATCH 47/84] graph : update attn/kv_self names --- src/llama-context.cpp | 12 ++++++------ src/llama-context.h | 6 +++--- src/llama-graph.h | 6 +++--- src/llama-model.cpp | 10 +++++----- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 01dd19e559481..94d6d4f907d08 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2491,7 +2491,7 @@ void llama_context_kv_self::kv_self_update() { ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - build_k_shift(ctx0, gf); + build_kv_self_shift(ctx0, gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2520,7 +2520,7 @@ void llama_context_kv_self::kv_self_update() { ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - build_defrag(ctx0, gf); + build_kv_self_defrag(ctx0, gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2762,7 +2762,7 @@ ggml_tensor * llama_context_kv_self::build_attn_qkv( return cur; } -ggml_tensor * llama_context_kv_self::build_soft_max_ext( +ggml_tensor * llama_context_kv_self::build_attn_soft_max( ggml_context * ctx0, ggml_tensor * 
kq, float kq_scale) { @@ -2771,7 +2771,7 @@ ggml_tensor * llama_context_kv_self::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -void llama_context_kv_self::build_k_shift( +void llama_context_kv_self::build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * graph) { const auto & n_ctx = cparams.n_ctx; @@ -2843,7 +2843,7 @@ void llama_context_kv_self::build_k_shift( } } -void llama_context_kv_self::build_defrag( +void llama_context_kv_self::build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * graph) { const auto & hparams = model.hparams; @@ -2860,7 +2860,7 @@ void llama_context_kv_self::build_defrag( // number of cells moved uint32_t n_moves = 0; - // each move requires 6*n_layer tensors (see build_defrag) + // each move requires 6*n_layer tensors (see build_kv_self_defrag) // - source view, destination view, copy operation // - x2 for keys and values //const uint32_t max_moves = model.max_nodes()/(6*n_layer); diff --git a/src/llama-context.h b/src/llama-context.h index e3483228d3d1a..7a10f84bd86bb 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -379,17 +379,17 @@ class llama_context_kv_self : public llama_context { int il, bool worst_case) override; - virtual ggml_tensor * build_soft_max_ext( + virtual ggml_tensor * build_attn_soft_max( ggml_context * ctx0, ggml_tensor * kq, float kq_scale) override; - virtual void build_k_shift( + virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * graph) override; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_defrag( + virtual void build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * graph) override; diff --git a/src/llama-graph.h b/src/llama-graph.h index 5267d53da4c06..d60b57491f2cb 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -92,17 +92,17 @@ class llama_graph_i { int il, bool worst_case) = 0; - virtual ggml_tensor * build_soft_max_ext( + virtual ggml_tensor * build_attn_soft_max( ggml_context * ctx0, ggml_tensor * kq, float kq_scale) = 0; - virtual void build_k_shift( + virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * graph) = 0; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_defrag( + virtual void build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * graph) = 0; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ba11f1e1514cc..543e78d2b9c41 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4251,18 +4251,18 @@ struct llm_build_context { return cur; } - struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * build_kv_self_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - lgf.build_k_shift(ctx0, gf); + lgf.build_kv_self_shift(ctx0, gf); return gf; } - struct ggml_cgraph * build_defrag() { + struct ggml_cgraph * build_kv_self_defrag() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - lgf.build_defrag(ctx0, gf); + lgf.build_kv_self_defrag(ctx0, gf); return gf; } @@ -5638,7 +5638,7 @@ struct llm_build_context { cb(kq, "kq", il); //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); + kq = lgf.build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, 
ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); From c23590319a54f1bb0c92033fec750e029cdab956 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 11:16:53 +0200 Subject: [PATCH 48/84] graph : add llama_graph_result ggml-ci --- src/llama-context.cpp | 67 ++++--- src/llama-context.h | 6 +- src/llama-graph.h | 7 + src/llama-model.cpp | 433 +++++++++++------------------------------- src/llama-model.h | 4 +- 5 files changed, 167 insertions(+), 350 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 94d6d4f907d08..55f1c03826468 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -246,31 +246,48 @@ void llama_context::init() { uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + int n_splits_pp = -1; + int n_nodes_pp = -1; + + int n_splits_tg = -1; + int n_nodes_tg = -1; + // reserve pp graph first so that buffers are only allocated once - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); + { + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto res_pp = graph_build(ubatch_pp, true); + auto & gf_pp = res_pp.gf; + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + n_nodes_pp = ggml_graph_n_nodes(gf_pp); } - int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); + { + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto res_tg = graph_build(ubatch_tg, true); + auto & gf_tg = res_tg.gf; + if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + n_nodes_tg = ggml_graph_n_nodes(gf_tg); } - int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = build_graph(ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); + { + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, 
nullptr, nullptr}; + auto res_pp = graph_build(ubatch_pp, true); + auto & gf_pp = res_pp.gf; + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } } for (size_t i = 0; i < backend_ptrs.size(); ++i) { @@ -890,7 +907,7 @@ void llama_context::build_cb( } } -ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) { +llama_graph_result llama_context::graph_build(const llama_ubatch & ubatch, bool worst_case) { return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case); } @@ -1814,11 +1831,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = build_graph(ubatch, true); + auto res = graph_build(ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { + if (!ggml_backend_sched_reserve(sched.get(), res.gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } @@ -1828,7 +1845,9 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_cgraph * gf = build_graph(ubatch, false); + auto res = graph_build(ubatch, false); + + auto & gf = res.gf; // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -2073,7 +2092,9 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_cgraph * gf = build_graph(ubatch, false); + auto res = graph_build(ubatch, false); + + auto & gf = res.gf; ggml_backend_sched_alloc_graph(sched.get(), gf); diff --git a/src/llama-context.h b/src/llama-context.h index 7a10f84bd86bb..981afcc005b06 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -95,6 +95,9 @@ struct llama_context : public llama_graph_i { // zero-out inputs and create ggml_context virtual ggml_context_ptr graph_init(); + // TODO: add encode/decode graphs + virtual llama_graph_result graph_build(const llama_ubatch & ubatch, bool worst_case); + // returns the result of ggml_backend_sched_graph_compute_async execution virtual enum ggml_status graph_compute( ggml_cgraph * graph, @@ -145,9 +148,6 @@ struct llama_context : public llama_graph_i { const llama_ubatch & ubatch, int il); - // TODO: add encode/decode graphs - virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case); - // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, diff --git a/src/llama-graph.h b/src/llama-graph.h index d60b57491f2cb..de3cd2f043458 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -10,6 +10,13 @@ struct ggml_context; struct ggml_tensor; struct llama_ubatch; +struct llama_graph_result { + ggml_cgraph * gf = nullptr; + + ggml_tensor * t_logits = nullptr; + ggml_tensor * t_embd = nullptr; +}; + // TODO: can become more granular in the future class llama_graph_i { public: diff --git 
a/src/llama-model.cpp b/src/llama-model.cpp index 543e78d2b9c41..4950af59bf01e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4251,22 +4251,6 @@ struct llm_build_context { return cur; } - struct ggml_cgraph * build_kv_self_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - lgf.build_kv_self_shift(ctx0, gf); - - return gf; - } - - struct ggml_cgraph * build_kv_self_defrag() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - lgf.build_kv_self_defrag(ctx0, gf); - - return gf; - } - struct ggml_tensor * build_inp_pos() { ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); cb(cur, "inp_pos", -1); @@ -4295,7 +4279,7 @@ struct llm_build_context { return cur; } - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { + void append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { @@ -4356,8 +4340,6 @@ struct llm_build_context { cb(cur, "result_embd_pooled", -1); ggml_build_forward_expand(gf, cur); - - return gf; } //struct ggml_tensor * build_pos_bucket(bool causal) { @@ -4406,9 +4388,7 @@ struct llm_build_context { return cur; } - struct ggml_cgraph * build_llama() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4563,13 +4543,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_deci() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_deci(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4719,13 +4695,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_baichuan(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4834,13 +4806,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_xverse() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_xverse(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4937,13 +4905,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_falcon(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5057,13 +5021,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_grok() { - struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_grok(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5211,13 +5171,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_dbrx() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_dbrx(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5334,13 +5290,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_starcoder(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5438,13 +5390,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_refact() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_refact(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5532,13 +5480,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_bert() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_bert(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -5726,13 +5670,9 @@ struct llm_build_context { cb(cur, "result_embd", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_bloom(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5827,13 +5767,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_mpt(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5967,13 +5903,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_stablelm() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - + void build_stablelm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6117,13 +6049,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_qwen() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_qwen(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; 
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6229,13 +6157,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_qwen2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6341,12 +6265,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_qwen2vl() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + void build_qwen2vl(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6457,13 +6378,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_qwen2moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_qwen2moe(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6601,13 +6518,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_phi2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_phi2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6722,13 +6635,11 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); - return gf; } - struct ggml_cgraph * build_phi3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_phi3(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6866,14 +6777,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - - struct ggml_cgraph * build_plamo() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - + void build_plamo(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6971,13 +6877,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_gpt2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_gpt2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7076,13 +6978,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_codeshell() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_codeshell(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t 
n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7187,13 +7085,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_orion() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_orion(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7305,13 +7199,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_internlm2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_internlm2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7423,13 +7313,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_minicpm3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_minicpm3(ggml_cgraph * gf) { //TODO: if the model varies, these parameters need to be read from the model const int64_t n_embd_base = 256; const float scale_embd = 12.0f; @@ -7633,13 +7519,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_gemma() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_gemma(ggml_cgraph * gf) { const int64_t n_embd_head_k = hparams.n_embd_head_k; struct ggml_tensor * cur; @@ -7741,13 +7623,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_gemma2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_gemma2(ggml_cgraph * gf) { const int64_t n_embd_head_k = hparams.n_embd_head_k; struct ggml_tensor * cur; @@ -7871,14 +7749,10 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - - struct ggml_cgraph * build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + // TODO: move up next to build_starcoder + void build_starcoder2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7991,13 +7865,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_mamba() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_mamba(ggml_cgraph * gf) { struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -8045,14 +7915,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_command_r() { - - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_command_r(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); const float f_logit_scale = hparams.f_logit_scale; @@ -8193,14 +8058,9 @@ struct llm_build_context { cb(cur, "result_output", 
-1); ggml_build_forward_expand(gf, cur); - - return gf; - } - struct ggml_cgraph * build_cohere2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_cohere2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); const float f_logit_scale = hparams.f_logit_scale; @@ -8322,8 +8182,6 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } // ref: https://allenai.org/olmo @@ -8332,9 +8190,7 @@ struct llm_build_context { // * clamp qkv // * removed bias // * removed MoE - struct ggml_cgraph * build_olmo() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_olmo(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8447,13 +8303,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_olmo2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_olmo2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8566,17 +8418,13 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } // based on the build_qwen2moe() function, changes: // * removed shared experts // * removed bias // * added q, k norm - struct ggml_cgraph * build_olmoe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_olmoe(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8692,13 +8540,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_openelm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_openelm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8817,13 +8661,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_gptneox() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_gptneox(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8960,13 +8800,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_arctic() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_arctic(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -9089,13 +8925,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_deepseek() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_deepseek(ggml_cgraph * gf) { 
const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -9244,13 +9076,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_deepseek2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_deepseek2(ggml_cgraph * gf) { bool is_lite = (hparams.n_layer == 27); // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. @@ -9471,13 +9299,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_bitnet() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_bitnet(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9622,12 +9446,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - return gf; } - //struct ggml_cgraph * build_t5_enc() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + //void build_t5_enc(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9749,13 +9570,9 @@ struct llm_build_context { // cb(cur, "result_norm", -1); // ggml_build_forward_expand(gf, cur); - - // return gf; //} - //struct ggml_cgraph * build_t5_dec() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + //void build_t5_dec(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9954,9 +9771,7 @@ struct llm_build_context { // return gf; //} - struct ggml_cgraph * build_jais() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_jais(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10041,13 +9856,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_chatglm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_chatglm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10170,13 +9981,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_nemotron() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_nemotron(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); //GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10290,13 +10097,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_exaone() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_exaone(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; 
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10412,13 +10215,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_rwkv6(ggml_cgraph * gf) { GGML_ASSERT(hparams.token_shift_count == 2); struct ggml_tensor * cur; @@ -10502,14 +10301,10 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_rwkv6qwen2(ggml_cgraph * gf) { GGML_ASSERT(n_embd == hparams.n_embd_k_s()); struct ggml_tensor * cur; @@ -10586,8 +10381,6 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } // ref: https://github.com/facebookresearch/chameleon @@ -10596,9 +10389,7 @@ struct llm_build_context { // * swin-norm // * removed bias // * removed MoE - struct ggml_cgraph * build_chameleon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_chameleon(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10759,13 +10550,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_wavtokenizer_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_wavtokenizer_dec(ggml_cgraph * gf) { struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -10911,231 +10698,233 @@ struct llm_build_context { cb(cur, "result_embd", -1); ggml_build_forward_expand(gf, cur); - - return gf; } }; -ggml_cgraph * llama_model::build_graph( +llama_graph_result llama_model::build_graph( llama_graph_i & lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, ggml_context_ptr && ctx, bool worst_case) const { - struct ggml_cgraph * result = NULL; + llama_graph_result result = {}; struct llm_build_context llm(lgf, *this, cparams, ubatch, std::move(ctx), worst_case); + auto & gf = result.gf; + + gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false); + switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_MINICPM: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: { - result = llm.build_llama(); + llm.build_llama(gf); } break; case LLM_ARCH_DECI: { - result = llm.build_deci(); + llm.build_deci(gf); } break; case LLM_ARCH_BAICHUAN: { - result = llm.build_baichuan(); + llm.build_baichuan(gf); } break; case LLM_ARCH_FALCON: { - result = llm.build_falcon(); + llm.build_falcon(gf); } break; case LLM_ARCH_GROK: { - result = llm.build_grok(); + llm.build_grok(gf); } break; case LLM_ARCH_STARCODER: { - result = llm.build_starcoder(); + llm.build_starcoder(gf); } break; case LLM_ARCH_REFACT: { - result = llm.build_refact(); + llm.build_refact(gf); } break; case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: { - result = llm.build_bert(); + llm.build_bert(gf); } break; case LLM_ARCH_BLOOM: { - result = llm.build_bloom(); + llm.build_bloom(gf); } break; case LLM_ARCH_MPT: { - result = llm.build_mpt(); + llm.build_mpt(gf); } break; case LLM_ARCH_STABLELM: { - 
result = llm.build_stablelm(); + llm.build_stablelm(gf); } break; case LLM_ARCH_QWEN: { - result = llm.build_qwen(); + llm.build_qwen(gf); } break; case LLM_ARCH_QWEN2: { - result = llm.build_qwen2(); + llm.build_qwen2(gf);; } break; case LLM_ARCH_QWEN2VL: { - result = llm.build_qwen2vl(); + llm.build_qwen2vl(gf); } break; case LLM_ARCH_QWEN2MOE: { - result = llm.build_qwen2moe(); + llm.build_qwen2moe(gf); } break; case LLM_ARCH_PHI2: { - result = llm.build_phi2(); + llm.build_phi2(gf); } break; case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: { - result = llm.build_phi3(); + llm.build_phi3(gf); } break; case LLM_ARCH_PLAMO: { - result = llm.build_plamo(); + llm.build_plamo(gf); } break; case LLM_ARCH_GPT2: { - result = llm.build_gpt2(); + llm.build_gpt2(gf); } break; case LLM_ARCH_CODESHELL: { - result = llm.build_codeshell(); + llm.build_codeshell(gf); } break; case LLM_ARCH_ORION: { - result = llm.build_orion(); + llm.build_orion(gf); } break; case LLM_ARCH_INTERNLM2: { - result = llm.build_internlm2(); + llm.build_internlm2(gf); } break; case LLM_ARCH_MINICPM3: { - result = llm.build_minicpm3(); + llm.build_minicpm3(gf); } break; case LLM_ARCH_GEMMA: { - result = llm.build_gemma(); + llm.build_gemma(gf); } break; case LLM_ARCH_GEMMA2: { - result = llm.build_gemma2(); + llm.build_gemma2(gf); } break; case LLM_ARCH_STARCODER2: { - result = llm.build_starcoder2(); + llm.build_starcoder2(gf); } break; case LLM_ARCH_MAMBA: { - result = llm.build_mamba(); + llm.build_mamba(gf); } break; case LLM_ARCH_XVERSE: { - result = llm.build_xverse(); + llm.build_xverse(gf); } break; case LLM_ARCH_COMMAND_R: { - result = llm.build_command_r(); + llm.build_command_r(gf); } break; case LLM_ARCH_COHERE2: { - result = llm.build_cohere2(); + llm.build_cohere2(gf); } break; case LLM_ARCH_DBRX: { - result = llm.build_dbrx(); + llm.build_dbrx(gf); } break; case LLM_ARCH_OLMO: { - result = llm.build_olmo(); + llm.build_olmo(gf); } break; case LLM_ARCH_OLMO2: { - result = llm.build_olmo2(); + llm.build_olmo2(gf); } break; case LLM_ARCH_OLMOE: { - result = llm.build_olmoe(); + llm.build_olmoe(gf); } break; case LLM_ARCH_OPENELM: { - result = llm.build_openelm(); + llm.build_openelm(gf); } break; case LLM_ARCH_GPTNEOX: { - result = llm.build_gptneox(); + llm.build_gptneox(gf); } break; case LLM_ARCH_ARCTIC: { - result = llm.build_arctic(); + llm.build_arctic(gf); } break; case LLM_ARCH_DEEPSEEK: { - result = llm.build_deepseek(); + llm.build_deepseek(gf); } break; case LLM_ARCH_DEEPSEEK2: { - result = llm.build_deepseek2(); + llm.build_deepseek2(gf); } break; case LLM_ARCH_CHATGLM: { - result = llm.build_chatglm(); + llm.build_chatglm(gf); } break; case LLM_ARCH_BITNET: { - result = llm.build_bitnet(); + llm.build_bitnet(gf); } break; //case LLM_ARCH_T5: // { // if (lctx.is_encoding) { - // result = llm.build_t5_enc(); + // llm.build_t5_enc(gf); // } else { - // result = llm.build_t5_dec(); + // llm.build_t5_dec(gf); // } // } break; //case LLM_ARCH_T5ENCODER: // { - // result = llm.build_t5_enc(); + // llm.build_t5_enc(gf); // } break; case LLM_ARCH_JAIS: { - result = llm.build_jais(); + llm.build_jais(gf); } break; case LLM_ARCH_NEMOTRON: { - result = llm.build_nemotron(); + llm.build_nemotron(gf); } break; case LLM_ARCH_EXAONE: { - result = llm.build_exaone(); + llm.build_exaone(gf); } break; case LLM_ARCH_RWKV6: { - result = llm.build_rwkv6(); + llm.build_rwkv6(gf); } break; case LLM_ARCH_RWKV6QWEN2: { - result = llm.build_rwkv6qwen2(); + llm.build_rwkv6qwen2(gf); } break; case LLM_ARCH_CHAMELEON: { - result = 
llm.build_chameleon(); + llm.build_chameleon(gf); } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - result = llm.build_wavtokenizer_dec(); + llm.build_wavtokenizer_dec(gf); } break; default: GGML_ABORT("fatal error"); @@ -11143,7 +10932,7 @@ ggml_cgraph * llama_model::build_graph( // add on pooling layer if (cparams.embeddings) { - result = llm.append_pooling(result); + llm.append_pooling(gf); } return result; diff --git a/src/llama-model.h b/src/llama-model.h index 0374b484b10ab..a3267bbbbb44a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -16,6 +16,7 @@ class llama_graph_i; struct llama_cparams; struct llama_ubatch; struct llama_model_loader; +struct llama_graph_result; // available models enum llm_type { @@ -368,8 +369,7 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; // TODO: add encode/decode graphs - // TODO: return a struct containing the graph and the output tensors, such as logits, embeddings, etc. - ggml_cgraph * build_graph( + llama_graph_result build_graph( llama_graph_i & lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, From 172f61690cb612be187980c5174707aeb5871714 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 13:48:43 +0200 Subject: [PATCH 49/84] cont : return important tensors ggml-ci --- src/llama-context.cpp | 29 +++-- src/llama-context.h | 5 +- src/llama-graph.h | 6 +- src/llama-model.cpp | 289 ++++++++++++++++++++++++++++++++++++++---- src/llama-model.h | 10 +- 5 files changed, 293 insertions(+), 46 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 55f1c03826468..d39263d288f8b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -255,7 +255,8 @@ void llama_context::init() { // reserve pp graph first so that buffers are only allocated once { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto res_pp = graph_build(ubatch_pp, true); + auto ctx = graph_init(); + auto res_pp = graph_build(ctx, ubatch_pp, true); auto & gf_pp = res_pp.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); @@ -269,7 +270,8 @@ void llama_context::init() { // reserve with tg graph to get the number of splits and nodes { llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto res_tg = graph_build(ubatch_tg, true); + auto ctx = graph_init(); + auto res_tg = graph_build(ctx, ubatch_tg, true); auto & gf_tg = res_tg.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); @@ -282,7 +284,8 @@ void llama_context::init() { // reserve again with pp graph to avoid ggml-alloc reallocations during inference { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto res_pp = graph_build(ubatch_pp, true); + auto ctx = graph_init(); + auto res_pp = graph_build(ctx, ubatch_pp, true); auto & gf_pp = res_pp.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); @@ -569,6 +572,13 @@ ggml_context_ptr llama_context::graph_init() { return ggml_context_ptr { ggml_init(params) }; } +llama_graph_result llama_context::graph_build( + ggml_context_ptr & ctx, + const llama_ubatch & ubatch, + bool worst_case) { + return model.build_graph(ctx, *this, cparams, ubatch, 
worst_case); +} + enum ggml_status llama_context::graph_compute( ggml_cgraph * graph, bool batched) { @@ -907,10 +917,6 @@ void llama_context::build_cb( } } -llama_graph_result llama_context::graph_build(const llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case); -} - llama_perf_context_data llama_context::perf_get_data() const { llama_perf_context_data data = {}; @@ -1831,7 +1837,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto res = graph_build(ubatch, true); + auto ctx = graph_init(); + auto res = graph_build(ctx, ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); @@ -1845,7 +1852,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto res = graph_build(ubatch, false); + auto ctx = graph_init(); + auto res = graph_build(ctx, ubatch, false); auto & gf = res.gf; @@ -2092,7 +2100,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto res = graph_build(ubatch, false); + auto ctx = graph_init(); + auto res = graph_build(ctx, ubatch, false); auto & gf = res.gf; diff --git a/src/llama-context.h b/src/llama-context.h index 981afcc005b06..e3ab12e59c746 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -96,7 +96,10 @@ struct llama_context : public llama_graph_i { virtual ggml_context_ptr graph_init(); // TODO: add encode/decode graphs - virtual llama_graph_result graph_build(const llama_ubatch & ubatch, bool worst_case); + virtual llama_graph_result graph_build( + ggml_context_ptr & ctx, + const llama_ubatch & ubatch, + bool worst_case); // returns the result of ggml_backend_sched_graph_compute_async execution virtual enum ggml_status graph_compute( diff --git a/src/llama-graph.h b/src/llama-graph.h index de3cd2f043458..14d0c5da0a359 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -13,8 +13,10 @@ struct llama_ubatch; struct llama_graph_result { ggml_cgraph * gf = nullptr; - ggml_tensor * t_logits = nullptr; - ggml_tensor * t_embd = nullptr; + // important graph nodes + ggml_tensor * t_logits = nullptr; + ggml_tensor * t_embd = nullptr; + ggml_tensor * t_embd_pooled = nullptr; }; // TODO: can become more granular in the future diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4950af59bf01e..ecfd6f185039a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3841,17 +3841,19 @@ struct llm_build_context { const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; - const ggml_context_ptr ctx = nullptr; - ggml_context * ctx0 = nullptr; + ggml_context_ptr & ctx; + ggml_context * ctx0 = nullptr; + + llama_graph_result res; // TODO: consider making the entire interface noexcept llm_build_context( - llama_graph_i & lgf, - const llama_model & model, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - ggml_context_ptr && ctx, - bool worst_case) : + ggml_context_ptr & ctx, + llama_graph_i & lgf, + const llama_model & model, 
+ const llama_cparams & cparams, + const llama_ubatch & ubatch, + bool worst_case) : lgf (lgf), model (model), hparams (model.hparams), @@ -3883,7 +3885,7 @@ struct llm_build_context { flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx (std::move(ctx)), + ctx (ctx), ctx0 (this->ctx.get()) { } @@ -4280,16 +4282,18 @@ struct llm_build_context { } void append_pooling(struct ggml_cgraph * gf) { - // find result_norm tensor for input - struct ggml_tensor * inp = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - inp = ggml_graph_node(gf, i); - if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { - break; - } + struct ggml_tensor * inp = res.t_embd; + + //// find result_norm tensor for input + //for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + // inp = ggml_graph_node(gf, i); + // if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { + // break; + // } + + // inp = nullptr; + //} - inp = nullptr; - } GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); struct ggml_tensor * cur; @@ -4338,6 +4342,7 @@ struct llm_build_context { } cb(cur, "result_embd_pooled", -1); + res.t_embd_pooled = cur; ggml_build_forward_expand(gf, cur); } @@ -4390,6 +4395,7 @@ struct llm_build_context { void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4530,7 +4536,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -4541,12 +4549,14 @@ struct llm_build_context { } cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_deci(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4682,7 +4692,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -4693,12 +4705,14 @@ struct llm_build_context { } cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_baichuan(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4799,17 +4813,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_xverse(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4898,11 +4917,15 @@ struct llm_build_context { cur = inpL; cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -4910,6 +4933,7 @@ struct llm_build_context { void build_falcon(ggml_cgraph * gf) { const int64_t 
n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5015,16 +5039,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_grok(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5158,7 +5187,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -5169,6 +5200,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, 0.5773502691896257f); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5176,6 +5208,7 @@ struct llm_build_context { void build_dbrx(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5282,12 +5315,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5295,6 +5331,7 @@ struct llm_build_context { void build_starcoder(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5384,16 +5421,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_refact(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5473,11 +5515,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5668,6 +5714,7 @@ struct llm_build_context { cur = inpL; cb(cur, "result_embd", -1); + res.t_embd = cur; ggml_build_forward_expand(gf, cur); } @@ -5675,6 +5722,7 @@ struct llm_build_context { void build_bloom(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5761,10 +5809,14 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5772,6 +5824,7 @@ struct llm_build_context { void build_mpt(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = 
hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5897,16 +5950,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_stablelm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6042,17 +6100,22 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_qwen(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6150,17 +6213,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_qwen2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6258,17 +6326,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_qwen2vl(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6371,17 +6444,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_qwen2moe(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6511,11 +6589,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6523,6 +6605,7 @@ struct llm_build_context { void build_phi2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6628,13 +6711,17 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6642,6 +6729,7 @@ struct llm_build_context { void build_phi3(ggml_cgraph * gf) { const 
int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6656,7 +6744,7 @@ struct llm_build_context { lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { - auto residual = inpL; + auto * residual = inpL; // self-attention { @@ -6766,7 +6854,9 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); @@ -6774,13 +6864,16 @@ struct llm_build_context { cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); } + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_plamo(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6870,11 +6963,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6882,6 +6979,7 @@ struct llm_build_context { void build_gpt2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6972,10 +7070,14 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6983,6 +7085,7 @@ struct llm_build_context { void build_codeshell(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7079,16 +7182,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_orion(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7192,17 +7300,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_internlm2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7306,11 +7419,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7507,7 +7624,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, 
-1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head scaling const float scale_lmhead = float(n_embd_base)/float(n_embd); @@ -7516,7 +7635,9 @@ struct llm_build_context { // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7616,11 +7737,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7736,7 +7861,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -7747,6 +7874,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7754,6 +7882,7 @@ struct llm_build_context { // TODO: move up next to build_starcoder void build_starcoder2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7858,11 +7987,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7908,18 +8041,24 @@ struct llm_build_context { cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_command_r(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; struct ggml_tensor * cur; @@ -8046,7 +8185,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -8056,13 +8197,16 @@ struct llm_build_context { } cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_cohere2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; struct ggml_tensor * cur; @@ -8170,7 +8314,9 @@ struct llm_build_context { cur = inpL; cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -8180,6 +8326,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8192,6 +8339,7 @@ struct llm_build_context { // * removed MoE void build_olmo(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8296,17 +8444,22 @@ struct llm_build_context { cur = build_norm(cur, NULL, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = 
build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_olmo2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8411,11 +8564,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8426,6 +8583,7 @@ struct llm_build_context { // * added q, k norm void build_olmoe(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8533,17 +8691,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_openelm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -8655,10 +8818,14 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8666,6 +8833,7 @@ struct llm_build_context { void build_gptneox(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -8794,16 +8962,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_arctic(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8918,17 +9091,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_deepseek(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -9068,12 +9246,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9292,17 +9473,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_bitnet(ggml_cgraph * gf) { 
const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -9438,12 +9624,16 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output cur = build_lora_mm(model.tok_embd, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9451,6 +9641,7 @@ struct llm_build_context { //void build_t5_enc(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // struct ggml_tensor * cur; @@ -9567,7 +9758,9 @@ struct llm_build_context { // cur = build_norm(cur, // model.output_norm_enc, NULL, // LLM_NORM_RMS, -1); + // // cb(cur, "result_norm", -1); + // res.t_embd = cur; // ggml_build_forward_expand(gf, cur); //} @@ -9575,6 +9768,7 @@ struct llm_build_context { //void build_t5_dec(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // struct ggml_tensor * cur; @@ -9760,11 +9954,15 @@ struct llm_build_context { // cur = build_norm(cur, // model.output_norm, NULL, // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + // res.t_embd = cur; // // lm_head // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + // res.t_logits = cur; // ggml_build_forward_expand(gf, cur); @@ -9774,6 +9972,7 @@ struct llm_build_context { void build_jais(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -9849,11 +10048,14 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9861,6 +10063,7 @@ struct llm_build_context { void build_chatglm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -9975,16 +10178,21 @@ struct llm_build_context { model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_nemotron(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); //GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10090,17 +10298,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_exaone(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10208,11 +10421,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, 
LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10290,15 +10507,21 @@ struct llm_build_context { } cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10375,10 +10598,14 @@ struct llm_build_context { cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10391,6 +10618,7 @@ struct llm_build_context { // * removed MoE void build_chameleon(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10530,7 +10758,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -10546,8 +10776,11 @@ struct llm_build_context { struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); cb(img_logits, "img_logits", -1); + cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10695,23 +10928,23 @@ struct llm_build_context { cur = build_lora_mm(model.output, cur); cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_embd", -1); + res.t_embd = cur; ggml_build_forward_expand(gf, cur); } }; llama_graph_result llama_model::build_graph( - llama_graph_i & lgf, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - ggml_context_ptr && ctx, + ggml_context_ptr & ctx, + llama_graph_i & lgf, + const llama_cparams & cparams, + const llama_ubatch & ubatch, bool worst_case) const { - llama_graph_result result = {}; + struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case); - struct llm_build_context llm(lgf, *this, cparams, ubatch, std::move(ctx), worst_case); - - auto & gf = result.gf; + auto & gf = llm.res.gf; gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false); @@ -10935,7 +11168,7 @@ llama_graph_result llama_model::build_graph( llm.append_pooling(gf); } - return result; + return llm.res; } // diff --git a/src/llama-model.h b/src/llama-model.h index a3267bbbbb44a..f5d1f7b79f50b 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -370,11 +370,11 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( - llama_graph_i & lgf, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - ggml_context_ptr && ctx, - bool worst_case) const; + ggml_context_ptr & ctx, + llama_graph_i & lgf, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + bool worst_case) const; private: struct impl; From bc6f187e9c0d40ca355e088708e4323bac2828da Mon Sep 17 00:00:00 2001 From: 
Georgi Gerganov Date: Tue, 18 Feb 2025 14:24:17 +0200 Subject: [PATCH 50/84] cont : use returned tensors from the graph build ggml-ci --- src/llama-context.cpp | 60 ++++++++++--------------------------------- 1 file changed, 13 insertions(+), 47 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d39263d288f8b..b508a4f8d194c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1855,7 +1855,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { auto ctx = graph_init(); auto res = graph_build(ctx, ubatch, false); - auto & gf = res.gf; + auto * gf = res.gf; // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -1863,29 +1863,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { input_set(ubatch); - // the output is always the last tensor in the graph - struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); - struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); - - if (n_outputs == 0) { - // no output - t_logits = nullptr; - t_embd = nullptr; - } else if (cparams.embeddings) { - t_logits = nullptr; // do not extract logits for embedding case - t_embd = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - t_embd = ggml_graph_node(gf, i); - break; - } - } - GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); - } else { - t_embd = nullptr; // do not extract embeddings when not needed - GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); - } - const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { switch (compute_status) { @@ -1914,8 +1891,15 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} + auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; + auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; + + if (t_embd && res.t_embd_pooled) { + t_embd = res.t_embd_pooled; + } + // extract logits - if (t_logits) { + if (t_logits && n_outputs > 0) { ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(logits != nullptr); @@ -1930,7 +1914,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { } // extract embeddings - if (t_embd) { + if (t_embd && n_outputs > 0) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); @@ -2103,32 +2087,12 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { auto ctx = graph_init(); auto res = graph_build(ctx, ubatch, false); - auto & gf = res.gf; + auto * gf = res.gf; ggml_backend_sched_alloc_graph(sched.get(), gf); input_set(ubatch); - // the output embeddings after the final encoder normalization - struct ggml_tensor * t_embd = nullptr; - - // there are two cases here - if (llama_model_has_decoder(&model)) { - // first case is an encoder-decoder T5 model where embeddings are passed to decoder - t_embd = ggml_graph_node(gf, -1); - GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); - } else { - // second case is an encoder-only T5 model - if (cparams.embeddings) { - // only output embeddings if required - t_embd = ggml_graph_node(gf, -1); - if (strcmp(t_embd->name, "result_embd_pooled") != 0) { - t_embd = ggml_graph_node(gf, -2); - } - GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); - } - } - const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: @@ -2142,6 +2106,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { return -3; } + auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + // extract embeddings if (t_embd) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); From befe14f06f2f36e16f87a79706d874d406c51bfa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 14:47:53 +0200 Subject: [PATCH 51/84] llama : reorder encode/decode in sources --- src/llama-context.cpp | 324 +++++++++++++++++++++--------------------- src/llama-context.h | 20 +-- 2 files changed, 172 insertions(+), 172 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b508a4f8d194c..0e0af806d66c9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1655,6 +1655,168 @@ ggml_context_ptr llama_context_kv_self::graph_init() { return llama_context::graph_init(); } +int llama_context_kv_self::encode(llama_batch & inp_batch) { + is_encoding = true; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + const int32_t n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (output_reserve(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (int32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + inp_embd_enc = NULL; + n_outputs = n_tokens; + + //batch_manager->prepare(ubatch); + + // TODO: do reserve + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto ctx = graph_init(); + auto res = graph_build(ctx, ubatch, false); + + auto * gf = res.gf; + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + auto * t_embd = res.t_embd_pooled ? 
res.t_embd_pooled : res.t_embd; + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + if (llama_model_has_decoder(&model)) { + embd_enc.resize(n_tokens*n_embd); + float * embd_out = embd_enc.data(); + + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + // remember the sequence ids used during the encoding - needed for cross attention later + seq_ids_enc.resize(n_tokens); + for (int32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + int llama_context_kv_self::decode(llama_batch & inp_batch) { is_encoding = false; @@ -2020,168 +2182,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -int llama_context_kv_self::encode(llama_batch & inp_batch) { - is_encoding = true; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - const int32_t n_tokens = batch.n_tokens; - - const auto & hparams = model.hparams; - - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - - if (batch.token) { - for (int32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - } - } - - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); - - if (t_compute_start_us == 0) { - t_compute_start_us = ggml_time_us(); - } - - n_queued_tokens += n_tokens; - - const int64_t n_embd = hparams.n_embd; - - sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); - - // reserve output buffer - if (output_reserve(n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); - return -2; - }; - - for (int32_t i = 0; i < n_tokens; ++i) { - output_ids[i] = i; - } - - inp_embd_enc = NULL; - n_outputs = n_tokens; - - //batch_manager->prepare(ubatch); - - // TODO: do reserve - GGML_ASSERT(need_reserve == false); - - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - - auto ctx = graph_init(); - auto res = graph_build(ctx, ubatch, false); - - auto * gf = res.gf; - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - input_set(ubatch); - - const auto compute_status = graph_compute(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - - auto * t_embd = res.t_embd_pooled ? 
res.t_embd_pooled : res.t_embd; - - // extract embeddings - if (t_embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); - GGML_ASSERT(backend_embd != nullptr); - - if (llama_model_has_decoder(&model)) { - embd_enc.resize(n_tokens*n_embd); - float * embd_out = embd_enc.data(); - - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - // remember the sequence ids used during the encoding - needed for cross attention later - seq_ids_enc.resize(n_tokens); - for (int32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - seq_ids_enc[i].insert(seq_id); - } - } - } else { - GGML_ASSERT(embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(embd != nullptr); - float * embd_out = embd; - - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings - auto & embd_seq_out = embd_seq; - embd_seq_out.clear(); - - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - for (int32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - } - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. 
- ggml_backend_sched_reset(sched.get()); - - return 0; -} - llama_pos llama_context_kv_self::pos_max() const { return kv_self.pos_max(); } diff --git a/src/llama-context.h b/src/llama-context.h index e3ab12e59c746..9f6abfc824b3d 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -116,30 +116,30 @@ struct llama_context : public llama_graph_i { // TODO: maybe remove this virtual void output_reorder(); - // decode a batch of tokens by evaluating the transformer - // in case of unsuccessful decoding (error or warning), - // the kv_cache state will be returned to its original state - // (for non-recurrent models) or cleaned (for recurrent models) + // encode a batch of tokens by evaluating the encoder part of the transformer // // - lctx: llama context - // - inp_batch: batch to evaluate + // - batch: batch to evaluate // // return 0 on success // return positive int on warning // return negative int on error // - virtual int decode(llama_batch & inp_batch) = 0; + virtual int encode(llama_batch & inp_batch) = 0; - // encode a batch of tokens by evaluating the encoder part of the transformer + // decode a batch of tokens by evaluating the transformer + // in case of unsuccessful decoding (error or warning), + // the kv_cache state will be returned to its original state + // (for non-recurrent models) or cleaned (for recurrent models) // // - lctx: llama context - // - batch: batch to evaluate + // - inp_batch: batch to evaluate // // return 0 on success // return positive int on warning // return negative int on error // - virtual int encode(llama_batch & inp_batch) = 0; + virtual int decode(llama_batch & inp_batch) = 0; // // graph build API (generic) @@ -336,8 +336,8 @@ class llama_context_kv_self : public llama_context { virtual void input_set(const llama_ubatch & ubatch) override; - virtual int decode(llama_batch & inp_batch) override; virtual int encode(llama_batch & inp_batch) override; + virtual int decode(llama_batch & inp_batch) override; // max token position across all sequences in the current context llama_pos pos_max() const; From 9e50456e19ac5c24c40387e6b4a2b3072f7a9d8e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 14:53:02 +0200 Subject: [PATCH 52/84] context : minor simplify ggml-ci --- src/llama-context.cpp | 24 +++++++++++------------- src/llama-context.h | 2 +- src/llama-model.cpp | 20 +++++++++----------- src/llama-model.h | 2 +- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0e0af806d66c9..d9735cfaa41fc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -256,7 +256,7 @@ void llama_context::init() { { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; auto ctx = graph_init(); - auto res_pp = graph_build(ctx, ubatch_pp, true); + auto res_pp = graph_build(ctx.get(), ubatch_pp, true); auto & gf_pp = res_pp.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); @@ -271,7 +271,7 @@ void llama_context::init() { { llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; auto ctx = graph_init(); - auto res_tg = graph_build(ctx, ubatch_tg, true); + auto res_tg = graph_build(ctx.get(), ubatch_tg, true); auto & gf_tg = res_tg.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); @@ -285,7 +285,7 @@ 
void llama_context::init() { { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; auto ctx = graph_init(); - auto res_pp = graph_build(ctx, ubatch_pp, true); + auto res_pp = graph_build(ctx.get(), ubatch_pp, true); auto & gf_pp = res_pp.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); @@ -573,7 +573,7 @@ ggml_context_ptr llama_context::graph_init() { } llama_graph_result llama_context::graph_build( - ggml_context_ptr & ctx, + ggml_context * ctx, const llama_ubatch & ubatch, bool worst_case) { return model.build_graph(ctx, *this, cparams, ubatch, worst_case); @@ -1720,7 +1720,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto ctx = graph_init(); - auto res = graph_build(ctx, ubatch, false); + auto res = graph_build(ctx.get(), ubatch, false); auto * gf = res.gf; @@ -2000,7 +2000,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; auto ctx = graph_init(); - auto res = graph_build(ctx, ubatch, true); + auto res = graph_build(ctx.get(), ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); @@ -2015,7 +2015,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto ctx = graph_init(); - auto res = graph_build(ctx, ubatch, false); + auto res = graph_build(ctx.get(), ubatch, false); auto * gf = res.gf; @@ -2483,11 +2483,10 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_reset(sched.get()); auto ctx = graph_init(); - auto * ctx0 = ctx.get(); - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); - build_kv_self_shift(ctx0, gf); + build_kv_self_shift(ctx.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2512,11 +2511,10 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_reset(sched.get()); auto ctx = graph_init(); - auto * ctx0 = ctx.get(); - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); - build_kv_self_defrag(ctx0, gf); + build_kv_self_defrag(ctx.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); diff --git a/src/llama-context.h b/src/llama-context.h index 9f6abfc824b3d..4bf8244e625c1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -97,7 +97,7 @@ struct llama_context : public llama_graph_i { // TODO: add encode/decode graphs virtual llama_graph_result graph_build( - ggml_context_ptr & ctx, + ggml_context * ctx, const llama_ubatch & ubatch, bool worst_case); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ecfd6f185039a..289c3422e3dcf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3841,19 +3841,18 @@ struct llm_build_context { const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; - ggml_context_ptr & ctx; - ggml_context * ctx0 = nullptr; + ggml_context * ctx0 = nullptr; llama_graph_result res; // TODO: consider making the entire interface noexcept llm_build_context( - ggml_context_ptr & ctx, - llama_graph_i 
& lgf, - const llama_model & model, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - bool worst_case) : + ggml_context * ctx, + llama_graph_i & lgf, + const llama_model & model, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + bool worst_case) : lgf (lgf), model (model), hparams (model.hparams), @@ -3885,8 +3884,7 @@ struct llm_build_context { flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx (ctx), - ctx0 (this->ctx.get()) { + ctx0 (ctx) { } // TODO: tmp @@ -10937,7 +10935,7 @@ struct llm_build_context { }; llama_graph_result llama_model::build_graph( - ggml_context_ptr & ctx, + ggml_context * ctx, llama_graph_i & lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, diff --git a/src/llama-model.h b/src/llama-model.h index f5d1f7b79f50b..a7c53bdbdc7ea 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -370,7 +370,7 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( - ggml_context_ptr & ctx, + ggml_context * ctx, llama_graph_i & lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, From 2bffc2d514ac2a86acae27037e0e466ebc723fd4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 14:57:26 +0200 Subject: [PATCH 53/84] model : pass llama_graph_i as ptr ggml-ci --- src/llama-context.cpp | 2 +- src/llama-model.cpp | 252 +++++++++++++++++++++--------------------- src/llama-model.h | 2 +- 3 files changed, 128 insertions(+), 128 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d9735cfaa41fc..bfcdf6cddcf30 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -576,7 +576,7 @@ llama_graph_result llama_context::graph_build( ggml_context * ctx, const llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(ctx, *this, cparams, ubatch, worst_case); + return model.build_graph(ctx, this, cparams, ubatch, worst_case); } enum ggml_status llama_context::graph_compute( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 289c3422e3dcf..350dfd89cee3d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3804,7 +3804,6 @@ enum llm_norm_type { }; struct llm_build_context { - llama_graph_i & lgf; const llama_model & model; const llama_hparams & hparams; const llama_cparams & cparams; @@ -3842,18 +3841,18 @@ struct llm_build_context { const enum llama_rope_type rope_type; ggml_context * ctx0 = nullptr; + llama_graph_i * lgf = nullptr; llama_graph_result res; // TODO: consider making the entire interface noexcept llm_build_context( ggml_context * ctx, - llama_graph_i & lgf, + llama_graph_i * lgf, const llama_model & model, const llama_cparams & cparams, const llama_ubatch & ubatch, bool worst_case) : - lgf (lgf), model (model), hparams (model.hparams), cparams (cparams), @@ -3884,17 +3883,18 @@ struct llm_build_context { flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx0 (ctx) { + ctx0 (ctx), + lgf (lgf) { } // TODO: tmp void cb(struct ggml_tensor * cur, const char * name, int il) { - lgf.build_cb(cur, name, ubatch, il); + lgf->build_cb(cur, name, ubatch, il); } // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch); + struct ggml_tensor * inpL = lgf->build_inp_embd(ctx0, tok_embd, ubatch); cb(inpL, "inp_embd", -1); return inpL; @@ -3904,7 +3904,7 @@ struct llm_build_context { struct ggml_tensor * build_lora_mm( struct 
ggml_tensor * w, struct ggml_tensor * cur) { - return lgf.build_lora_mm(ctx0, w, cur); + return lgf->build_lora_mm(ctx0, w, cur); } // TODO: tmp @@ -3912,7 +3912,7 @@ struct llm_build_context { struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b struct ggml_tensor * ids) { - return lgf.build_lora_mm_id(ctx0, w, cur, ids); + return lgf->build_lora_mm_id(ctx0, w, cur, ids); } struct ggml_tensor * build_norm( @@ -4211,12 +4211,12 @@ struct llm_build_context { ggml_build_forward_expand(graph, v_cur); //build_kv_store(graph, k_cur, v_cur, il); - lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + lgf->build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); struct ggml_tensor * cur; //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cur = lgf->build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -4252,28 +4252,28 @@ struct llm_build_context { } struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); cb(cur, "inp_pos", -1); return cur; } struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); cb(cur, "inp_out_ids", -1); return cur; } struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); cb(cur, "inp_mean", -1); return cur; } struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); cb(cur, "inp_cls", -1); return cur; @@ -4378,14 +4378,14 @@ struct llm_build_context { //} struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); cb(cur, "embd_enc", -1); return cur; } struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); cb(cur, "KQ_mask_cross", -1); return cur; @@ -4405,7 +4405,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4420,7 +4420,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -4522,7 +4522,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4566,7 +4566,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4592,7 +4592,7 @@ struct llm_build_context { } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -4678,7 +4678,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4722,7 +4722,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4799,7 +4799,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4838,7 +4838,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4905,7 +4905,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4943,7 +4943,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5023,7 +5023,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5066,7 +5066,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5173,7 +5173,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5218,7 +5218,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5301,7 +5301,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5340,7 +5340,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5408,7 +5408,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5441,7 +5441,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ 
-5501,7 +5501,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5555,7 +5555,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5626,7 +5626,7 @@ struct llm_build_context { cb(kq, "kq", il); //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lgf.build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); + kq = lgf->build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -5728,7 +5728,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); inpL = build_norm(inpL, model.tok_norm, @@ -5796,7 +5796,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5831,7 +5831,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -5935,7 +5935,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5973,7 +5973,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -6085,7 +6085,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6124,7 +6124,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6199,7 +6199,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6238,7 +6238,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6312,7 +6312,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ 
-6351,7 +6351,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6430,7 +6430,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6469,7 +6469,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6575,7 +6575,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6616,7 +6616,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6698,7 +6698,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6739,7 +6739,7 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6747,7 +6747,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, @@ -6841,7 +6841,7 @@ struct llm_build_context { cur = ggml_add(ctx0, residual, cur); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6883,7 +6883,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -6949,7 +6949,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6989,7 +6989,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7057,7 +7057,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - 
cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7095,7 +7095,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7169,7 +7169,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7206,7 +7206,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7286,7 +7286,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7325,7 +7325,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7405,7 +7405,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7453,12 +7453,12 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -7610,7 +7610,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7654,7 +7654,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -7723,7 +7723,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7762,7 +7762,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -7847,7 +7847,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7892,7 +7892,7 @@ struct 
llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7973,7 +7973,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8005,8 +8005,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -8016,7 +8016,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); - cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -8028,7 +8028,7 @@ struct llm_build_context { // residual cur = ggml_add(ctx0, cur, inpL); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8067,7 +8067,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -8171,7 +8171,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8215,7 +8215,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8233,7 +8233,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -8302,7 +8302,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8349,7 +8349,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8430,7 +8430,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = 
lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8469,7 +8469,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8550,7 +8550,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8593,7 +8593,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8677,7 +8677,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8714,7 +8714,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8804,7 +8804,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -8842,7 +8842,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -8919,7 +8919,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, attn_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8948,7 +8948,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8986,7 +8986,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9077,7 +9077,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9116,7 +9116,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9132,7 +9132,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -9232,7 +9232,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9279,7 +9279,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9459,7 +9459,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9497,7 +9497,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9978,7 +9978,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10072,7 +10072,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10202,7 +10202,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10284,7 +10284,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10323,7 +10323,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10337,7 +10337,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -10407,7 +10407,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - 
cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10441,8 +10441,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10451,7 +10451,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; - struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( + struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load( ctx0, gf, state_copy, state_mask, ubatch, il, worst_case ); @@ -10468,7 +10468,7 @@ struct llm_build_context { 1 ); - cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -10491,13 +10491,13 @@ struct llm_build_context { ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 ); - ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { cur = ggml_scale(ctx0, cur, 0.5F); } - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10533,8 +10533,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10545,7 +10545,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; - struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( + struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load( ctx0, gf, state_copy, state_mask, ubatch, il, worst_case ); @@ -10559,10 +10559,10 @@ struct llm_build_context { 1 ); - cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); - ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); struct ggml_tensor * ffn_inp = 
ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -10583,7 +10583,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10628,7 +10628,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10744,7 +10744,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10936,7 +10936,7 @@ struct llm_build_context { llama_graph_result llama_model::build_graph( ggml_context * ctx, - llama_graph_i & lgf, + llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, bool worst_case) const { diff --git a/src/llama-model.h b/src/llama-model.h index a7c53bdbdc7ea..2a9fca7d40c6d 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -371,7 +371,7 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( ggml_context * ctx, - llama_graph_i & lgf, + llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, bool worst_case) const; From f5cedbcaaa5070d17f5290a03fd3124d58a3b824 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 21:26:42 +0200 Subject: [PATCH 54/84] kv-cache : prepare for abstraction ggml-ci --- src/llama-context.cpp | 518 +++++++++-------------------------------- src/llama-context.h | 47 ++-- src/llama-graph.h | 48 ++-- src/llama-kv-cache.cpp | 319 ++++++++++++++++++++++++- src/llama-kv-cache.h | 76 +++--- src/llama-model.cpp | 117 +++++----- src/llama-model.h | 3 +- 7 files changed, 594 insertions(+), 534 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bfcdf6cddcf30..454e141c85796 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -201,7 +201,7 @@ void llama_context::init() { backend_ptrs.push_back(backend.get()); } - const size_t max_nodes = model.max_nodes(); + const size_t max_nodes = this->max_nodes(); // buffer used to store the computation graph and the tensor meta data // TODO: move to base class @@ -255,39 +255,36 @@ void llama_context::init() { // reserve pp graph first so that buffers are only allocated once { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_pp = graph_build(ctx.get(), ubatch_pp, true); - auto & gf_pp = res_pp.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_pp = ggml_graph_n_nodes(gf_pp); + n_nodes_pp = ggml_graph_n_nodes(gf); } // reserve with tg graph to get the number of splits and nodes { llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_tg = graph_build(ctx.get(), ubatch_tg, true); - auto & gf_tg = 
res_tg.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_tg, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_tg = ggml_graph_n_nodes(gf_tg); + n_nodes_tg = ggml_graph_n_nodes(gf); } // reserve again with pp graph to avoid ggml-alloc reallocations during inference { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_pp = graph_build(ctx.get(), ubatch_pp, true); - auto & gf_pp = res_pp.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } @@ -350,6 +347,10 @@ uint32_t llama_context::n_threads_batch() const { return cparams.n_threads_batch; } +int32_t llama_context::max_nodes() const { + return std::max(8192, 5*model.n_tensors()); +} + enum llama_pooling_type llama_context::pooling_type() const { return cparams.pooling_type; } @@ -555,7 +556,7 @@ void llama_context::synchronize() { t_compute_start_us = 0; } -ggml_context_ptr llama_context::graph_init() { +ggml_cgraph * llama_context::graph_init() { inp_tokens = nullptr; inp_embd = nullptr; inp_pos = nullptr; @@ -569,18 +570,21 @@ ggml_context_ptr llama_context::graph_init() { /*.no_alloc =*/ true, }; - return ggml_context_ptr { ggml_init(params) }; + ctx_compute.reset(ggml_init(params)); + + return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); } llama_graph_result llama_context::graph_build( ggml_context * ctx, + ggml_cgraph * gf, const llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(ctx, this, cparams, ubatch, worst_case); + return model.build_graph(ctx, gf, this, cparams, ubatch, worst_case); } enum ggml_status llama_context::graph_compute( - ggml_cgraph * graph, + ggml_cgraph * gf, bool batched) { int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; @@ -596,7 +600,7 @@ enum ggml_status llama_context::graph_compute( set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf); if (status != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); } @@ -881,7 +885,6 @@ void llama_context::output_reorder() { } } - void llama_context::build_cb( ggml_tensor * cur, const char * name, @@ -1010,6 +1013,55 @@ ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } +ggml_tensor * llama_context::build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) { + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & n_rot = model.hparams.n_rot; + const auto & rope_type = model.hparams.rope_type; + + struct ggml_tensor * tmp; + + if (ggml_is_quantized(cur->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); + + if (bbuf) { + for (auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + } + + tmp = ggml_rope_ext_inplace(ctx0, tmp, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + + tmp = ggml_cpy(ctx0, tmp, cur); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, cur, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + + return tmp; +} + ggml_tensor * llama_context::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, @@ -1579,7 +1631,8 @@ void llama_context::perf_reset() { llama_context_kv_self::llama_context_kv_self( const llama_model & model, const llama_context_params & params) : - llama_context(model, params) { + llama_context(model, params), + kv_self(model.hparams) { const auto & hparams = model.hparams; LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); @@ -1640,13 +1693,13 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -ggml_context_ptr llama_context_kv_self::graph_init() { +ggml_cgraph * llama_context_kv_self::graph_init() { inp_KQ_mask = nullptr; inp_KQ_mask_cnv = nullptr; inp_KQ_mask_swa = nullptr; inp_KQ_mask_swa_cnv = nullptr; inp_KQ_mask_cross = nullptr; - inp_K_shift = nullptr; + inp_k_shift = nullptr; inp_s_copy = nullptr; inp_s_mask = nullptr; inp_embd_enc = nullptr; @@ -1719,10 +1772,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, false); - - auto * gf = res.gf; + auto * gf 
= graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -1999,12 +2050,12 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, true); + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), res.gf)) { + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } @@ -2014,10 +2065,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, false); - - auto * gf = res.gf; + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -2195,10 +2244,10 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (inp_K_shift) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + if (inp_k_shift) { + assert(ggml_backend_buffer_is_host(inp_k_shift->buffer)); - int32_t * data = (int32_t *) inp_K_shift->data; + int32_t * data = (int32_t *) inp_k_shift->data; for (uint32_t i = 0; i < kv_self.size; ++i) { data[i] = kv_self.cells[i].delta; @@ -2482,11 +2531,9 @@ void llama_context_kv_self::kv_self_update() { if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched.get()); - auto ctx = graph_init(); + auto * gf = graph_init(); - ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); - - build_kv_self_shift(ctx.get(), gf); + kv_self.build_shift(ctx_compute.get(), gf, this); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2510,11 +2557,9 @@ void llama_context_kv_self::kv_self_update() { if (kv.do_defrag) { ggml_backend_sched_reset(sched.get()); - auto ctx = graph_init(); - - ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); + auto * gf = graph_init(); - build_kv_self_defrag(ctx.get(), gf); + kv_self.build_defrag(ctx_compute.get(), gf, max_nodes(), !cparams.flash_attn); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2529,6 +2574,13 @@ void llama_context_kv_self::kv_self_update() { } } +ggml_tensor * llama_context_kv_self::build_inp_k_shift(ggml_context * ctx0) { + inp_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + ggml_set_input(inp_k_shift); + + return inp_k_shift; +} + void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, @@ -2765,348 +2817,6 @@ ggml_tensor * llama_context_kv_self::build_attn_soft_max( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -void llama_context_kv_self::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) { - const auto & 
n_ctx = cparams.n_ctx; - const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & freq_base = cparams.rope_freq_base; - const auto & freq_scale = cparams.rope_freq_scale; - - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_attn_factor = cparams.yarn_attn_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; - - const auto & hparams = model.hparams; - - const auto & n_rot = hparams.n_rot; - const auto & n_layer = hparams.n_layer; - const auto & rope_type = hparams.rope_type; - - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - GGML_ASSERT(kv_self.size == n_ctx); - - inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - //cb(inp_K_shift, "K_shift", -1); - ggml_set_input(inp_K_shift); - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - struct ggml_tensor * rope_factors = build_rope_factors(il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - 0); - - struct ggml_tensor * tmp; - if (ggml_is_quantized(k->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); - //cb(tmp, "K_f32", il); - - for (auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - tmp = ggml_rope_ext_inplace(ctx0, tmp, - inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - //cb(tmp, "K_shifted_f32", il); - - tmp = ggml_cpy(ctx0, tmp, k); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, k, - inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - } - //cb(tmp, "K_shifted", il); - - ggml_build_forward_expand(graph, tmp); - } -} - -void llama_context_kv_self::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) { - const auto & hparams = model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_kv_self_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = model.max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (model.max_nodes() - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - 
- uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - kv_self.cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - kv_self.head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = kv_self.size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(kv_self.v_l[il], 
buf_v.data(), 0, buf_v.size()); - } -#else - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (cparams.flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); - } - - ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } - - i += nm - 1; - } - - //LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes); -#endif -} - ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, @@ -3162,7 +2872,7 @@ ggml_tensor * llama_context_kv_self::build_inp_s_mask( ggml_tensor * llama_context_kv_self::build_copy_mask_state( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -3185,7 +2895,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state( states = ggml_mul(ctx0, states, state_mask); // copy states which won't be changed further (between n_seqs and n_kv) - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)), ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); @@ -3197,7 +2907,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state( // TODO: split ggml_tensor * llama_context_kv_self::build_mamba_layer( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -3231,11 +2941,11 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( // (ab)using the KV cache to store the states struct ggml_tensor * conv = build_copy_mask_state( - ctx0, graph, conv_states_all, state_copy, state_mask, + ctx0, gf, conv_states_all, state_copy, state_mask, n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); struct ggml_tensor * ssm = build_copy_mask_state( - ctx0, 
graph, ssm_states_all, state_copy, state_mask, + ctx0, gf, ssm_states_all, state_copy, state_mask, n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); @@ -3257,7 +2967,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( // copy last (d_conv - 1) columns back into the state cache struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv, ggml_view_1d(ctx0, conv_states_all, (d_conv - 1)*(d_inner)*(n_seqs), @@ -3306,7 +3016,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); // store last states - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); @@ -3333,7 +3043,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, @@ -3349,7 +3059,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( struct ggml_tensor * token_shift_all = kv_self.k_l[il]; struct ggml_tensor * token_shift = build_copy_mask_state( - ctx0, graph, token_shift_all, state_copy, state_mask, + ctx0, gf, token_shift_all, state_copy, state_mask, n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); @@ -3384,7 +3094,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor * state_copy, @@ -3509,7 +3219,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( } struct ggml_tensor * wkv_state = build_copy_mask_state( - ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + ctx0, gf, kv_self.v_l[il], state_copy, state_mask, n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); struct ggml_tensor * wkv_output; @@ -3522,7 +3232,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); ggml_build_forward_expand( - graph, + gf, ggml_cpy( ctx0, wkv_state, @@ -3558,7 +3268,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); - kv_self.state_write(io, model.hparams); + kv_self.state_write(io); return io.n_bytes(); } @@ -3566,7 +3276,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); - kv_self.state_read(io, model.hparams); + kv_self.state_read(io); return io.n_bytes(); } @@ -3574,7 +3284,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context::state_seq_get_data(io, seq_id); - 
kv_self.state_write(io, model.hparams, seq_id); + kv_self.state_write(io, seq_id); return io.n_bytes(); } @@ -3582,7 +3292,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context::state_seq_set_data(io, seq_id); - kv_self.state_read(io, model.hparams, seq_id); + kv_self.state_read(io, seq_id); return io.n_bytes(); } diff --git a/src/llama-context.h b/src/llama-context.h index 4bf8244e625c1..0311ad4734daf 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -43,6 +43,8 @@ struct llama_context : public llama_graph_i { virtual uint32_t n_threads() const; virtual uint32_t n_threads_batch() const; + virtual int32_t max_nodes() const; + virtual llama_kv_cache * get_kv_self() = 0; virtual const llama_kv_cache * get_kv_self() const = 0; @@ -93,18 +95,19 @@ struct llama_context : public llama_graph_i { virtual void synchronize(); // zero-out inputs and create ggml_context - virtual ggml_context_ptr graph_init(); + virtual ggml_cgraph * graph_init(); // TODO: add encode/decode graphs virtual llama_graph_result graph_build( - ggml_context * ctx, - const llama_ubatch & ubatch, - bool worst_case); + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch, + bool worst_case); // returns the result of ggml_backend_sched_graph_compute_async execution virtual enum ggml_status graph_compute( - ggml_cgraph * graph, - bool batched); + ggml_cgraph * gf, + bool batched); virtual void input_set(const llama_ubatch & ubatch); @@ -172,6 +175,13 @@ struct llama_context : public llama_graph_i { virtual ggml_tensor * build_rope_factors(int il); + virtual ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf); + virtual ggml_tensor * build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, @@ -274,6 +284,8 @@ struct llama_context : public llama_graph_i { ggml_backend_sched_ptr sched; + ggml_context_ptr ctx_compute; + // memory buffers used to evaluate the model std::vector buf_compute_meta; @@ -332,7 +344,7 @@ class llama_context_kv_self : public llama_context { virtual void kv_self_update() override; - virtual ggml_context_ptr graph_init() override; + virtual ggml_cgraph * graph_init() override; virtual void input_set(const llama_ubatch & ubatch) override; @@ -349,11 +361,13 @@ class llama_context_kv_self : public llama_context { llama_kv_cache kv_self; - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] + ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * inp_k_shift; // I32 [kv_size] + + virtual ggml_tensor * build_inp_k_shift(ggml_context * ctx0) override; virtual void build_attn_inp( ggml_context * ctx0, @@ -387,15 +401,6 @@ class llama_context_kv_self : public llama_context { ggml_tensor * kq, float kq_scale) override; - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) override; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache 
- virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) override; - // === encoder-decoder === // whether we are computing encoder output or decoder output diff --git a/src/llama-graph.h b/src/llama-graph.h index 14d0c5da0a359..6098d2b9293b4 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -8,11 +8,10 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; +struct ggml_backend_buffer; struct llama_ubatch; struct llama_graph_result { - ggml_cgraph * gf = nullptr; - // important graph nodes ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; @@ -50,6 +49,14 @@ class llama_graph_i { virtual ggml_tensor * build_rope_factors(int il) = 0; + // note: optionally set the backend to be the same as the bbuf's backend + virtual ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuft) = 0; + // graph build API (context-specific) virtual ggml_tensor * build_inp_embd( @@ -83,7 +90,7 @@ class llama_graph_i { virtual void build_attn_kv_store( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, @@ -92,7 +99,7 @@ class llama_graph_i { virtual ggml_tensor * build_attn_qkv( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -106,14 +113,8 @@ class llama_graph_i { ggml_tensor * kq, float kq_scale) = 0; - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; + virtual ggml_tensor * build_inp_k_shift( + ggml_context * ctx0) = 0; virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0, @@ -135,7 +136,7 @@ class llama_graph_i { virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -146,7 +147,7 @@ class llama_graph_i { virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -156,7 +157,7 @@ class llama_graph_i { virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, @@ -172,7 +173,7 @@ class llama_graph_i { virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor * state_copy, @@ -181,3 +182,18 @@ class llama_graph_i { int il, bool worst_case) = 0; }; + +class llama_graph_kv_cache_i { +public: + virtual void build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) = 0; +}; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index c93410f0a412c..5dde8b8703875 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -13,6 +13,9 @@ static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; +llama_kv_cache::llama_kv_cache(const 
llama_hparams & hparams) : hparams(hparams) { +} + bool llama_kv_cache::init( const llama_model & model, const llama_cparams & cparams, @@ -20,8 +23,6 @@ bool llama_kv_cache::init( ggml_type type_v, uint32_t kv_size, bool offload) { - const struct llama_hparams & hparams = model.hparams; - const int32_t n_layer = hparams.n_layer; has_shift = false; @@ -698,7 +699,309 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id) const { +void llama_kv_cache::build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) { + const auto & n_layer = hparams.n_layer; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + //GGML_ASSERT(kv_self.size == n_ctx); + + ggml_tensor * inp_k_shift = lgf->build_inp_k_shift(ctx0); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, k_l[il], + n_embd_head_k, n_head_kv, size, + ggml_row_size(k_l[il]->type, n_embd_head_k), + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + 0); + + ggml_tensor * cur = lgf->build_rope_shift(ctx0, k, inp_k_shift, rope_factors, k_l[il]->buffer); + + ggml_build_forward_expand(gf, cur); + } +} + +void llama_kv_cache::build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) { + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = cell_max(); + const uint32_t n_used = used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_kv_self_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = max_nodes/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (max_nodes - 2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? 
+ bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + 
if (!v_trans) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, id)); + } + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +#endif +} + +void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -733,16 +1036,16 @@ void llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hp io.write(&cell_count, sizeof(cell_count)); state_write_meta(io, cell_ranges, seq_id); - state_write_data(io, cell_ranges, hparams); + state_write_data(io, cell_ranges); } -void llama_kv_cache::state_read(llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id) { +void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id) { uint32_t cell_count; io.read_to(&cell_count, sizeof(cell_count)); bool res = true; res = res && state_read_meta(io, cell_count, seq_id); - res = res && state_read_data(io, hparams, cell_count); + res = res && state_read_data(io, cell_count); if (!res) { if (seq_id == -1) { @@ -773,7 +1076,7 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { +void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { const uint32_t v_trans = this->v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; @@ -955,7 +1258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t cell_count, return true; } -bool llama_kv_cache::state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count) { +bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; io.read_to(&v_trans, sizeof(v_trans)); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 3ea9abfce59be..67e59bc094b71 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -2,12 +2,12 @@ #include "llama.h" #include "llama-io.h" +#include "llama-graph.h" #include "ggml-cpp.h" #include #include -#include struct llama_cparams; struct llama_hparams; @@ -49,31 +49,13 @@ struct llama_kv_cache_slot_info { // TODO: pimpl // TODO: add notion of max sequences // TODO: add llama_hparams & -struct llama_kv_cache { - bool has_shift = false; - bool do_defrag = false; - bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token - bool v_trans = true; // the value tensor is transposed - bool can_shift = false; - - // Note: The value of head isn't only used to optimize searching - // for a free KV slot. llama_decode_impl also uses it, so it - // cannot be freely changed after a slot has been allocated. 
- uint32_t head = 0; - uint32_t size = 0; - uint32_t used = 0; // used cells (i.e. at least one seq_id) - - // computed before each graph build - uint32_t n = 0; - - std::vector cells; - - std::vector k_l; // per layer - std::vector v_l; +struct llama_kv_cache : public llama_graph_kv_cache_i { + llama_kv_cache(const llama_hparams & hparams); + virtual ~llama_kv_cache() = default; // TODO: become constructor bool init( - const llama_model & model, + const llama_model & model, // TODO: do not reference the model const llama_cparams & cparams, ggml_type type_k, ggml_type type_v, @@ -115,8 +97,48 @@ struct llama_kv_cache { size_t size_k_bytes() const; size_t size_v_bytes() const; - void state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; - void state_read (llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + // graph build API + + virtual void build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) override; + + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) override; + + // state save/load + + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1); + + // members + + const llama_hparams & hparams; + + bool has_shift = false; + bool do_defrag = false; + bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token + bool v_trans = true; // the value tensor is transposed + bool can_shift = false; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_impl also uses it, so it + // cannot be freely changed after a slot has been allocated. + uint32_t head = 0; + uint32_t size = 0; + uint32_t used = 0; // used cells (i.e. 
at least one seq_id) + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + std::vector k_l; // per layer + std::vector v_l; private: ggml_type type_k = GGML_TYPE_F16; @@ -126,10 +148,10 @@ struct llama_kv_cache { std::vector bufs; void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; - void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count); + bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; // diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 350dfd89cee3d..09fd63f61ce6c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3579,8 +3579,8 @@ size_t llama_model::size() const { return pimpl->n_bytes; } -size_t llama_model::max_nodes() const { - return std::max(8192, tensors_by_name.size()*5); +size_t llama_model::n_tensors() const { + return tensors_by_name.size(); } size_t llama_model::n_devices() const { @@ -3900,6 +3900,38 @@ struct llm_build_context { return inpL; } + // TODO: tmp + struct ggml_tensor * build_inp_pos() { + ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); + cb(cur, "inp_pos", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_out_ids() { + ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); + cb(cur, "inp_out_ids", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_mean() { + ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); + cb(cur, "inp_mean", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_cls() { + ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); + cb(cur, "inp_cls", -1); + + return cur; + } + // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, @@ -3915,6 +3947,22 @@ struct llm_build_context { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } + // TODO: tmp + struct ggml_tensor * build_inp_embd_enc() { + ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); + cb(cur, "embd_enc", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_KQ_mask_cross() { + ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + cb(cur, "KQ_mask_cross", -1); + + return cur; + } + struct ggml_tensor * build_norm( struct ggml_tensor * cur, struct ggml_tensor * mw, @@ -4195,7 +4243,7 @@ struct llm_build_context { } struct ggml_tensor * build_attn( - struct ggml_cgraph * graph, + struct ggml_cgraph * gf, struct ggml_tensor * wo, struct ggml_tensor * wo_b, struct ggml_tensor * k_cur, @@ -4206,17 +4254,17 @@ struct llm_build_context { int il) { // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); - //build_kv_store(graph, k_cur, v_cur, il); - lgf->build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + //build_kv_store(gf, k_cur, v_cur, il); + lgf->build_attn_kv_store(ctx0, gf, k_cur, 
v_cur, n_tokens, il, worst_case); struct ggml_tensor * cur; - //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf->build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + //cur = build_kqv(gf, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lgf->build_attn_qkv(ctx0, gf, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -4251,34 +4299,6 @@ struct llm_build_context { return cur; } - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); - cb(cur, "inp_pos", -1); - - return cur; - } - - struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); - cb(cur, "inp_out_ids", -1); - - return cur; - } - - struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); - cb(cur, "inp_mean", -1); - - return cur; - } - - struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); - cb(cur, "inp_cls", -1); - - return cur; - } - void append_pooling(struct ggml_cgraph * gf) { struct ggml_tensor * inp = res.t_embd; @@ -4377,20 +4397,6 @@ struct llm_build_context { // return pos_bias; //} - struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); - cb(cur, "embd_enc", -1); - - return cur; - } - - struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); - cb(cur, "KQ_mask_cross", -1); - - return cur; - } - void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -10936,16 +10942,13 @@ struct llm_build_context { llama_graph_result llama_model::build_graph( ggml_context * ctx, + ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, - bool worst_case) const { + bool worst_case) const { struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case); - auto & gf = llm.res.gf; - - gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false); - switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_MINICPM: diff --git a/src/llama-model.h b/src/llama-model.h index 2a9fca7d40c6d..94e7622943937 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -353,7 +353,7 @@ struct llama_model { std::string desc() const; size_t size() const; - size_t max_nodes() const; + size_t n_tensors() const; size_t n_devices() const; // total number of parameters in the model @@ -371,6 +371,7 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( ggml_context * ctx, + ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, From 5f11a5502a37df607d35c703f52dd6f8f6454bdd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Feb 2025 14:36:27 +0200 Subject: [PATCH 55/84] kv-cache : remove llama_kv_cache_i --- src/llama-context.cpp | 307 ++++++++++++++++++++++++++++++++++++++++- src/llama-context.h | 21 ++- src/llama-graph.h | 24 ++-- src/llama-kv-cache.cpp | 302 ---------------------------------------- src/llama-kv-cache.h | 15 +- 5 files changed, 330 insertions(+), 339 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 454e141c85796..bec82b4464303 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2533,7 +2533,7 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - kv_self.build_shift(ctx_compute.get(), gf, this); 
+ build_kv_self_shift(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2559,7 +2559,7 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - kv_self.build_defrag(ctx_compute.get(), gf, max_nodes(), !cparams.flash_attn); + build_kv_self_defrag(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2817,6 +2817,309 @@ ggml_tensor * llama_context_kv_self::build_attn_soft_max( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } +void llama_context_kv_self::build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) { + const auto & hparams = model.hparams; + + const auto & n_layer = hparams.n_layer; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + //GGML_ASSERT(kv_self.size == n_ctx); + + ggml_tensor * inp_k_shift = build_inp_k_shift(ctx0); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = build_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, kv_self.size, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0); + + ggml_tensor * cur = build_rope_shift(ctx0, k, inp_k_shift, rope_factors, kv_self.k_l[il]->buffer); + + ggml_build_forward_expand(gf, cur); + } +} + +void llama_context_kv_self::build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) { + const auto & hparams = model.hparams; + + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = kv_self.cell_max(); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_kv_self_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (max_nodes() - 2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? 
+ bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { // NOLINT + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, 
n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + } + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +#endif +} + ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama-context.h b/src/llama-context.h index 0311ad4734daf..a256f3042257b 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -378,7 +378,7 @@ class llama_context_kv_self : public llama_context { virtual void build_attn_kv_store( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, @@ -387,7 +387,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_attn_qkv( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -401,6 +401,15 @@ class llama_context_kv_self : public llama_context { ggml_tensor * kq, float kq_scale) override; + virtual void build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) override; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) override; + // === encoder-decoder === // whether we are computing encoder output or decoder output @@ -443,7 +452,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -454,7 +463,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -464,7 +473,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, @@ -480,7 +489,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor * state_copy, diff --git a/src/llama-graph.h b/src/llama-graph.h index 6098d2b9293b4..bb51b9a912f81 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -113,6 +113,15 @@ class llama_graph_i { 
ggml_tensor * kq, float kq_scale) = 0; + virtual void build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) = 0; + virtual ggml_tensor * build_inp_k_shift( ggml_context * ctx0) = 0; @@ -182,18 +191,3 @@ class llama_graph_i { int il, bool worst_case) = 0; }; - -class llama_graph_kv_cache_i { -public: - virtual void build_shift( - ggml_context * ctx0, - ggml_cgraph * gf, - llama_graph_i * lgf) = 0; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_defrag( - ggml_context * ctx0, - ggml_cgraph * gf, - int32_t max_nodes, - bool v_trans) = 0; -}; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 5dde8b8703875..8a87f91290eed 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -699,308 +699,6 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::build_shift( - ggml_context * ctx0, - ggml_cgraph * gf, - llama_graph_i * lgf) { - const auto & n_layer = hparams.n_layer; - - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - //GGML_ASSERT(kv_self.size == n_ctx); - - ggml_tensor * inp_k_shift = lgf->build_inp_k_shift(ctx0); - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, k_l[il], - n_embd_head_k, n_head_kv, size, - ggml_row_size(k_l[il]->type, n_embd_head_k), - ggml_row_size(k_l[il]->type, n_embd_k_gqa), - 0); - - ggml_tensor * cur = lgf->build_rope_shift(ctx0, k, inp_k_shift, rope_factors, k_l[il]->buffer); - - ggml_build_forward_expand(gf, cur); - } -} - -void llama_kv_cache::build_defrag( - ggml_context * ctx0, - ggml_cgraph * gf, - int32_t max_nodes, - bool v_trans) { - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = cell_max(); - const uint32_t n_used = used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_kv_self_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = max_nodes/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (max_nodes - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { 
- break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(v_l[il]->type); - const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, k_l[il], - n_embd_k_gqa, nm, - 
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
-
-        ggml_tensor * view_k_dst = ggml_view_2d(ctx0, k_l[il],
-                n_embd_k_gqa, nm,
-                ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
-
-        ggml_tensor * view_v_src;
-        ggml_tensor * view_v_dst;
-
-        if (!v_trans) {
-            // NOTE: the V cache is not transposed when using flash attention
-            view_v_src = ggml_view_2d(ctx0, v_l[il],
-                    n_embd_v_gqa, nm,
-                    ggml_row_size(v_l[il]->type, n_embd_v_gqa),
-                    ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
-
-            view_v_dst = ggml_view_2d(ctx0, v_l[il],
-                    n_embd_v_gqa, nm,
-                    ggml_row_size(v_l[il]->type, n_embd_v_gqa),
-                    ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
-        } else {
-            view_v_src = ggml_view_2d(ctx0, v_l[il],
-                    nm, n_embd_v_gqa,
-                    ggml_row_size(v_l[il]->type, size),
-                    ggml_row_size(v_l[il]->type, i));
-
-            view_v_dst = ggml_view_2d(ctx0, v_l[il],
-                    nm, n_embd_v_gqa,
-                    ggml_row_size(v_l[il]->type, size),
-                    ggml_row_size(v_l[il]->type, id));
-        }
-
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
-        }
-
-        i += nm - 1;
-    }
-
-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-#endif
-}
-
 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 67e59bc094b71..049193fd0f176 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -49,7 +49,7 @@ struct llama_kv_cache_slot_info {
 // TODO: pimpl
 // TODO: add notion of max sequences
 // TODO: add llama_hparams &
-struct llama_kv_cache : public llama_graph_kv_cache_i {
+struct llama_kv_cache {
     llama_kv_cache(const llama_hparams & hparams);
     virtual ~llama_kv_cache() = default;
 
@@ -97,19 +97,6 @@ struct llama_kv_cache : public llama_graph_kv_cache_i {
     size_t size_k_bytes() const;
     size_t size_v_bytes() const;
 
-    // graph build API
-
-    virtual void build_shift(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            llama_graph_i * lgf) override;
-
-    virtual void build_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            int32_t max_nodes,
-            bool v_trans) override;
-
     // state save/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;

From e17e4b72d16710ee430b6858d58ce6ab3f4a31bb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 19 Feb 2025 14:56:01 +0200
Subject: [PATCH 56/84] context : add llama_context_recurrent

ggml-ci
---
 src/llama-context.cpp | 151 ++++++++++++++++++++++++------------------
 src/llama-context.h   |  32 ++++++---
 src/llama-graph.cpp   | 135 +++++++++++++++++++++++++++++++++++++
 src/llama-graph.h     |  16 ++---
 src/llama.cpp         |  15 ++++-
 5 files changed, 266 insertions(+), 83 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index bec82b4464303..b571c9343fa88 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -20,6 +20,8 @@ llama_context::llama_context(
     model     (model),
     t_start_us(model.t_start_us),
     t_load_us (model.t_load_us) {
+    LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
+
     const auto & hparams = model.hparams;
 
     cparams.n_seq_max = std::max(1u, params.n_seq_max);
@@ -1633,6 +1635,8 @@ llama_context_kv_self::llama_context_kv_self(
         const llama_context_params & params) :
     llama_context(model, params),
     kv_self(model.hparams) {
+    LLAMA_LOG_INFO("%s: constructing llama_context_kv_self\n", __func__);
+
     const auto & hparams = 
model.hparams; LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); @@ -1700,8 +1704,6 @@ ggml_cgraph * llama_context_kv_self::graph_init() { inp_KQ_mask_swa_cnv = nullptr; inp_KQ_mask_cross = nullptr; inp_k_shift = nullptr; - inp_s_copy = nullptr; - inp_s_mask = nullptr; inp_embd_enc = nullptr; inp_pos_bucket = nullptr; @@ -2381,53 +2383,6 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } - if (kv_self.recurrent) { - const int64_t n_kv = kv_self.n; - - if (inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); - float * data = (float *) inp_s_mask->data; - - // clear unused states - for (int i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = kv_self.cells[cell_id]; - - data[i] = (float) (kv_cell.src >= 0); - - // TODO: do not mutate the KV cache - // only clear once - if (kv_cell.src < 0) { - kv_cell.src = cell_id; - } - } - } - - if (inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); - int32_t * data = (int32_t *) inp_s_copy->data; - - // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = kv_self.cells[cell_id]; - - // prevent out-of-bound sources - if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { - kv_cell.src = cell_id; - } - - data[i] = kv_cell.src; - - // TODO: do not mutate the KV cache - // ensure copy only happens once - if (kv_cell.src != (int32_t) cell_id) { - kv_cell.src = cell_id; - } - } - } - } - if (inp_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; @@ -2614,7 +2569,7 @@ void llama_context_kv_self::build_attn_inp( void llama_context_kv_self::build_attn_kv_store( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, @@ -2635,7 +2590,7 @@ void llama_context_kv_self::build_attn_kv_store( //cb(k_cache_view, "k_cache_view", il); // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(graph, ggml_cpy(ctx0, k_cur, k_cache_view)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); @@ -2653,12 +2608,12 @@ void llama_context_kv_self::build_attn_kv_store( } //cb(v_cache_view, "v_cache_view", il); - ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); } ggml_tensor * llama_context_kv_self::build_attn_qkv( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -2791,7 +2746,7 @@ ggml_tensor * llama_context_kv_self::build_attn_qkv( } } - ggml_build_forward_expand(graph, cur); + ggml_build_forward_expand(gf, cur); if (wo) { cur = build_lora_mm(ctx0, wo, cur); @@ -3152,7 +3107,79 @@ ggml_tensor * llama_context_kv_self::build_inp_KQ_mask_cross( return inp_KQ_mask_cross; } -ggml_tensor * llama_context_kv_self::build_inp_s_copy( +// +// llama_context_recurrent +// + +llama_context_recurrent::llama_context_recurrent( + const llama_model & model, + const llama_context_params & params) : + llama_context_kv_self(model, params) { + LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); +} + +llama_context_recurrent::~llama_context_recurrent() = default; + +ggml_cgraph * llama_context_recurrent::graph_init() { + inp_s_copy = nullptr; + inp_s_mask = 
nullptr; + + return llama_context_kv_self::graph_init(); +} + +void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { + // call base functionality + llama_context_kv_self::input_set(ubatch); + + GGML_ASSERT(kv_self.recurrent); + + const int64_t n_kv = kv_self.n; + + if (inp_s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); + float * data = (float *) inp_s_mask->data; + + // clear unused states + for (int i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self.head; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; + + data[i] = (float) (kv_cell.src >= 0); + + // TODO: do not mutate the KV cache + // only clear once + if (kv_cell.src < 0) { + kv_cell.src = cell_id; + } + } + } + + if (inp_s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); + int32_t * data = (int32_t *) inp_s_copy->data; + + // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self.head; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; + + // prevent out-of-bound sources + if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { + kv_cell.src = cell_id; + } + + data[i] = kv_cell.src; + + // TODO: do not mutate the KV cache + // ensure copy only happens once + if (kv_cell.src != (int32_t) cell_id) { + kv_cell.src = cell_id; + } + } + } +} + +ggml_tensor * llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? kv_self.size : kv_self.n; @@ -3163,7 +3190,7 @@ ggml_tensor * llama_context_kv_self::build_inp_s_copy( return inp_s_copy; } -ggml_tensor * llama_context_kv_self::build_inp_s_mask( +ggml_tensor * llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; @@ -3173,7 +3200,7 @@ ggml_tensor * llama_context_kv_self::build_inp_s_mask( return inp_s_mask; } -ggml_tensor * llama_context_kv_self::build_copy_mask_state( +ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * s, @@ -3208,7 +3235,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state( } // TODO: split -ggml_tensor * llama_context_kv_self::build_mamba_layer( +ggml_tensor * llama_context_recurrent::build_mamba_layer( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, @@ -3344,7 +3371,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( } -ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( +ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * state_copy, @@ -3370,8 +3397,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( return token_shift; } - -ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( +ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, @@ -3394,8 +3420,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( ); } - -ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( +ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, diff --git a/src/llama-context.h b/src/llama-context.h index a256f3042257b..133eb8b36f739 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -433,15 +433,28 @@ class llama_context_kv_self : public llama_context { int32_t n_tokens, bool worst_case) override; - // === recurrent === +protected: + virtual size_t state_get_data(llama_io_write_i & io) override; + virtual size_t state_set_data(llama_io_read_i & io) override; - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; +}; - // TODO: add recurrent cache - // TODO: add mamba-specific llama_context +// a recurrent transformer (ie.e RWKV, Mamba) +// TODO: temporary reuse kv_self, but in the future, implement recurrent-specific context with specific cache +class llama_context_recurrent : public llama_context_kv_self { +public: + llama_context_recurrent( + const llama_model & model, + const llama_context_params & params); + + virtual ~llama_context_recurrent(); + + virtual ggml_cgraph * graph_init() override; + + virtual void input_set(const llama_ubatch & ubatch) override; - // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, bool worst_case) override; @@ -499,11 +512,10 @@ class llama_context_kv_self : public llama_context { bool worst_case) override; protected: - virtual size_t state_get_data(llama_io_write_i & io) override; - virtual size_t state_set_data(llama_io_read_i & io) override; + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + // TODO: add recurrent cache }; // For internal test use diff --git a/src/llama-graph.cpp 
b/src/llama-graph.cpp index 20f2ee0bd56aa..17605e74cc90b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1 +1,136 @@ #include "llama-graph.h" + +#include "llama-impl.h" + +ggml_tensor * llama_graph_i::build_inp_s_copy ( + ggml_context * ctx0, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(s); + GGML_UNUSED(state_copy); + GGML_UNUSED(state_mask); + GGML_UNUSED(n_tokens); + GGML_UNUSED(n_state); + GGML_UNUSED(n_seqs); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(cur); + GGML_UNUSED(state_copy); + GGML_UNUSED(state_mask); + GGML_UNUSED(ubatch); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(state_copy); + GGML_UNUSED(state_mask); + GGML_UNUSED(ubatch); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(token_shift); + GGML_UNUSED(ubatch); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(cur); + GGML_UNUSED(x_prev); + GGML_UNUSED(state_copy); + GGML_UNUSED(state_mask); + GGML_UNUSED(ubatch); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} diff --git a/src/llama-graph.h b/src/llama-graph.h index bb51b9a912f81..b9456e3d1ca74 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -55,7 +55,7 @@ class llama_graph_i { ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuft) = 0; + ggml_backend_buffer * bbuf) = 0; // graph build API (context-specific) @@ -137,11 +137,11 @@ class llama_graph_i { virtual ggml_tensor * build_inp_s_copy( 
ggml_context * ctx0, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_inp_s_mask( ggml_context * ctx0, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -152,7 +152,7 @@ class llama_graph_i { int32_t n_tokens, int32_t n_state, int32_t n_seqs, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -162,7 +162,7 @@ class llama_graph_i { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -171,14 +171,14 @@ class llama_graph_i { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, int il, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -189,5 +189,5 @@ class llama_graph_i { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case) = 0; + bool worst_case); }; diff --git a/src/llama.cpp b/src/llama.cpp index a677902f0ba7c..3db1644775fe7 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -326,8 +326,19 @@ struct llama_context * llama_init_from_model( llama_context * ctx = nullptr; try { - // TODO: add logic which llama_context implementation to construct - ctx = new llama_context_kv_self(*model, params); + // TODO: make static method of llama_context + switch (model->arch) { + case LLM_ARCH_RWKV6: + case LLM_ARCH_RWKV6QWEN2: + case LLM_ARCH_MAMBA: + GGML_ASSERT(llama_model_is_recurrent(model)); + ctx = new llama_context_recurrent(*model, params); + break; + default: + GGML_ASSERT(!llama_model_is_recurrent(model)); + ctx = new llama_context_kv_self(*model, params); + }; + ctx->init(); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); From 2eacb4c1bfe01839f579e8aac3068f8758c26874 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Feb 2025 18:43:49 +0200 Subject: [PATCH 57/84] graph : simplify attention api ggml-ci --- src/llama-context.cpp | 65 +++++++++++++++++++------------------------ src/llama-context.h | 14 +++------- src/llama-graph.h | 13 ++------- src/llama-model.cpp | 8 +----- 4 files changed, 36 insertions(+), 64 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b571c9343fa88..818702143e196 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2567,63 +2567,56 @@ void llama_context_kv_self::build_attn_inp( } } -void llama_context_kv_self::build_attn_kv_store( +ggml_tensor * llama_context_kv_self::build_attn( ggml_context * ctx0, ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * q_cur, int32_t n_tokens, - int64_t il, + float kq_scale, + int il, bool worst_case) { const auto & hparams = model.hparams; const auto & n_ctx = cparams.n_ctx; - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - GGML_ASSERT(kv_self.size == n_ctx); + // store to KV cache + { + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); - //cb(k_cache_view, "k_cache_view", il); + GGML_ASSERT(kv_self.size == n_ctx); - // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + //cb(k_cache_view, "k_cache_view", il); - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); - struct ggml_tensor * v_cache_view = nullptr; + assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); - if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); - } else { - // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv_self.v_l[il]), - (kv_head)*ggml_element_size(kv_self.v_l[il])); + struct ggml_tensor * v_cache_view = nullptr; - v_cur = ggml_transpose(ctx0, v_cur); - } - //cb(v_cache_view, "v_cache_view", il); + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self.v_l[il]), + (kv_head)*ggml_element_size(kv_self.v_l[il])); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); -} + v_cur = ggml_transpose(ctx0, v_cur); + } + //cb(v_cache_view, "v_cache_view", il); -ggml_tensor * llama_context_kv_self::build_attn_qkv( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - int32_t n_tokens, - float kq_scale, - int il, - bool worst_case) { - const auto & hparams = model.hparams; + ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); + } - const auto & n_ctx = cparams.n_ctx; const auto & n_embd_head_k = hparams.n_embd_head_k; const auto & n_embd_head_v = hparams.n_embd_head_v; @@ -2657,8 +2650,6 @@ ggml_tensor * llama_context_kv_self::build_attn_qkv( const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); diff --git a/src/llama-context.h b/src/llama-context.h index 133eb8b36f739..fb241adf1d151 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -376,20 +376,13 @@ class llama_context_kv_self : public llama_context { bool swa, bool worst_case) override; - virtual void build_attn_kv_store( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - int32_t n_tokens, - int64_t il, - bool worst_case) override; - - virtual ggml_tensor * build_attn_qkv( + virtual ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * k_cur, + ggml_tensor * v_cur, ggml_tensor 
* q_cur, int32_t n_tokens, float kq_scale, @@ -443,6 +436,7 @@ class llama_context_kv_self : public llama_context { // a recurrent transformer (ie.e RWKV, Mamba) // TODO: temporary reuse kv_self, but in the future, implement recurrent-specific context with specific cache +//class llama_context_recurrent : public llama_context { class llama_context_recurrent : public llama_context_kv_self { public: llama_context_recurrent( diff --git a/src/llama-graph.h b/src/llama-graph.h index b9456e3d1ca74..9adfc6f2313e2 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -88,20 +88,13 @@ class llama_graph_i { bool swa, bool worst_case) = 0; - virtual void build_attn_kv_store( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - int32_t n_tokens, - int64_t il, - bool worst_case) = 0; - - virtual ggml_tensor * build_attn_qkv( + virtual ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * k_cur, + ggml_tensor * v_cur, ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 09fd63f61ce6c..a22720c3ad184 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4258,13 +4258,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - //build_kv_store(gf, k_cur, v_cur, il); - lgf->build_attn_kv_store(ctx0, gf, k_cur, v_cur, n_tokens, il, worst_case); - - struct ggml_tensor * cur; - - //cur = build_kqv(gf, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf->build_attn_qkv(ctx0, gf, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, k_cur, v_cur, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; From f95b04a21cbb748ff5ed1a0489389166bc345672 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Feb 2025 18:47:37 +0200 Subject: [PATCH 58/84] model : fix order kvq -> qkv ggml-ci --- src/llama-context.cpp | 12 +++--- src/llama-context.h | 2 +- src/llama-graph.h | 2 +- src/llama-model.cpp | 95 ++++++++++++++++++++++--------------------- 4 files changed, 56 insertions(+), 55 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 818702143e196..dbc9231acf1c8 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2572,9 +2572,9 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, @@ -2617,9 +2617,6 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); } - const auto & n_embd_head_k = hparams.n_embd_head_k; - const auto & n_embd_head_v = hparams.n_embd_head_v; - // TODO: improve bool is_sliding = false; @@ -2648,8 +2645,11 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); diff --git a/src/llama-context.h b/src/llama-context.h index fb241adf1d151..2b3d5f122bbbe 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -381,9 +381,9 @@ class llama_context_kv_self : public llama_context { ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, diff --git a/src/llama-graph.h b/src/llama-graph.h index 9adfc6f2313e2..b64e0f5f4fdb0 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -93,9 +93,9 @@ class llama_graph_i { ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a22720c3ad184..debbacbb6183b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4246,9 +4246,9 @@ struct llm_build_context { struct ggml_cgraph * gf, struct ggml_tensor * wo, struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il) { @@ -4258,7 +4258,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, k_cur, v_cur, q_cur, n_tokens, kq_scale, il, worst_case); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -4460,7 +4460,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -4632,7 +4632,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -4768,7 +4768,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -4874,7 +4874,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -4996,7 +4996,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5118,7 +5118,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -5265,7 +5265,7 @@ struct llm_build_context { cur = 
build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5375,7 +5375,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5470,7 +5470,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5763,7 +5763,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5896,13 +5896,13 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } } @@ -6048,7 +6048,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6168,7 +6168,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6283,7 +6283,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6401,7 +6401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6514,7 +6514,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6673,7 +6673,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -6796,7 +6796,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -6921,7 +6921,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } struct 
ggml_tensor * sa_out = cur; @@ -7024,7 +7024,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7136,7 +7136,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7257,7 +7257,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7376,7 +7376,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7570,7 +7570,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, il); + q_states, k_states, v_states, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -7692,7 +7692,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -7806,7 +7806,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } cur = build_norm(cur, @@ -7943,7 +7943,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8143,7 +8143,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8276,8 +8276,9 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - n_tokens, 1.0f / sqrtf(float(n_embd_head)), il); + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8400,7 +8401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8515,7 +8516,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } cur = build_norm(cur, @@ -8643,7 +8644,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8773,7 +8774,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, 
Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8883,7 +8884,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9025,7 +9026,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9172,7 +9173,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -9400,7 +9401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, il); + q_states, k_states, v_states, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -9558,7 +9559,7 @@ struct llm_build_context { cur = build_attn(gf, NULL, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, @@ -10007,7 +10008,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/float(n_embd_head), il); } if (il == n_layer - 1) { @@ -10135,7 +10136,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10254,7 +10255,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -10377,7 +10378,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -10699,7 +10700,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); if (hparams.swin_norm) { cur = build_norm(cur, From b1554be1d7213fbc628e184bffef5e42a734595d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Feb 2025 15:18:45 +0200 Subject: [PATCH 59/84] context : add cache-less llama_context ggml-ci --- common/common.cpp | 2 +- src/llama-context.cpp | 1210 ++++++++++++++++++++++++++++++---------- src/llama-context.h | 108 ++-- src/llama-graph.cpp | 78 +++ src/llama-graph.h | 21 +- src/llama-kv-cache.cpp | 44 ++ src/llama-model.cpp | 58 +- src/llama.cpp | 5 + 8 files changed, 1122 insertions(+), 404 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b751959569ca1..ec95f32d63122 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -952,7 +952,7 @@ struct common_init_result common_init_from_params(common_params & params) { } if (params.ctx_shift && 
!llama_kv_self_can_shift(lctx)) { - LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); + LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__); params.ctx_shift = false; } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index dbc9231acf1c8..6b2a11ad69097 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -341,6 +341,10 @@ uint32_t llama_context::n_ubatch() const { return cparams.n_ubatch; } +uint32_t llama_context::n_seq_max() const { + return cparams.n_seq_max; +} + uint32_t llama_context::n_threads() const { return cparams.n_threads; } @@ -353,6 +357,20 @@ int32_t llama_context::max_nodes() const { return std::max(8192, 5*model.n_tensors()); } +llama_kv_cache * llama_context::get_kv_self() { + LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); + return nullptr; +} + +const llama_kv_cache * llama_context::get_kv_self() const { + LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); + return nullptr; +} + +void llama_context::kv_self_update() { + LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); +} + enum llama_pooling_type llama_context::pooling_type() const { return cparams.pooling_type; } @@ -566,6 +584,9 @@ ggml_cgraph * llama_context::graph_init() { inp_mean = nullptr; inp_cls = nullptr; + inp_kq_mask = nullptr; + inp_kq_mask_cnv = nullptr; + struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta.size(), /*.mem_buffer =*/ buf_compute_meta.data(), @@ -612,179 +633,11 @@ enum ggml_status llama_context::graph_compute( return status; } -void llama_context::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); - } - - if (ubatch.embd) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); - } - - if (ubatch.pos && inp_pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - - if (!inp_out_ids) { - LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); - int32_t * data = (int32_t *) inp_out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_mean); - 
GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - - float * data = (float *) inp_mean->data; - memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } - - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); -} - int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; - const int64_t n_outputs_max = std::max(n_outputs, cparams.n_seq_max); + const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); @@ -887,72 +740,401 @@ void llama_context::output_reorder() { } } -void llama_context::build_cb( - 
ggml_tensor * cur, - const char * name, - const llama_ubatch & ubatch, - int il) { - if (il >= 0) { - ggml_format_name(cur, "%s-%d", name, il); - } else { - ggml_set_name(cur, name); +int llama_context::encode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; } - if (!cparams.offload_kqv) { - if (strcmp(name, "kqv_merged_cont") == 0) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); - } - } + // temporary allocate memory for the input batch if needed + llama_batch_allocr batch_allocr(inp_batch, 0); - // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends - // FIXME: fix in ggml_backend_sched - const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; - if (ubatch.n_tokens < 32 || full_offload) { - if (il != -1 && strcmp(name, "norm") == 0) { - const auto & dev_layer = model.dev_layer(il); - for (auto & backend : backends) { - if (ggml_backend_get_device(backend.get()) == dev_layer) { - if (ggml_backend_supports_op(backend.get(), cur)) { - ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get()); - } - } + const llama_batch & batch = batch_allocr.batch; + + const int32_t n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; } } } -} -llama_perf_context_data llama_context::perf_get_data() const { - llama_perf_context_data data = {}; + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); - data.t_start_ms = 1e-3 * t_start_us; - data.t_load_ms = 1e-3 * t_load_us; - data.t_p_eval_ms = 1e-3 * t_p_eval_us; - data.t_eval_ms = 1e-3 * t_eval_us; - data.n_p_eval = std::max(1, n_p_eval); - data.n_eval = std::max(1, n_eval); + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } - return data; -} + n_queued_tokens += n_tokens; -ggml_tensor * llama_context::build_cvec( - ggml_context * ctx0, - ggml_tensor * cur, - int il) { - return cvec.apply_to(ctx0, cur, il); -} + const int64_t n_embd = hparams.n_embd; -ggml_tensor * llama_context::build_lora_mm( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (output_reserve(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (int32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + n_outputs = n_tokens; + + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto * gf = 
graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + GGML_ASSERT(embd != nullptr); + + // extract token embeddings + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
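+    // note: the asynchronous tensor reads above are not synchronized here; they are expected to
+    //       complete when the caller obtains the output (same convention as in decode() below)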
+ ggml_backend_sched_reset(sched.get()); + + return 0; +} + +int llama_context::decode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + llama_batch_allocr batch_allocr(inp_batch, 0); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + + const int64_t n_tokens = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int64_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); + } + } + } + + // micro-batching is not possible without KV cache + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + n_queued_tokens += n_tokens; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + embd_seq.clear(); + + int64_t n_outputs_all = 0; + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + } else if (logits_all || embd_pooled) { + n_outputs_all = n_tokens; + } else { + // keep last output only + n_outputs_all = 1; + } + + const bool logits_all = n_outputs_all == n_tokens; + + sbatch.from_batch(batch, n_embd, + /* simple_split */ true, + /* logits_all */ logits_all); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + n_outputs = n_outputs_all; + + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; + auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; + + if (t_embd && res.t_embd_pooled) { + t_embd = res.t_embd_pooled; + } + + // extract logits + if (t_logits && n_outputs > 0) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits; + + if (n_outputs) { + GGML_ASSERT(n_outputs <= n_outputs_all); + GGML_ASSERT(n_outputs*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd && n_outputs > 0) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + if (n_outputs) { + GGML_ASSERT(n_outputs <= n_outputs_all); + GGML_ASSERT(n_outputs*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings (cleared before processing each batch) + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (int64_t i = 0; i < n_outputs_all; ++i) { + int64_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // wait for the computation to finish (automatically done when obtaining the model output) + //synchronize(); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
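+    // a minimal sketch of how the mapping above is typically consumed (assuming the usual
+    // public accessor is kept unchanged by this refactor):
+    //
+    //   // row of the logits buffer corresponding to the i-th token of the submitted batch
+    //   const float * logits_i = llama_get_logits_ith(ctx, i);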
+ ggml_backend_sched_reset(sched.get()); + + return 0; +} + +void llama_context::build_cb( + ggml_tensor * cur, + const char * name, + const llama_ubatch & ubatch, + int il) { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } + + if (!cparams.offload_kqv) { + if (strcmp(name, "kqv_merged_cont") == 0) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends + // FIXME: fix in ggml_backend_sched + const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; + if (ubatch.n_tokens < 32 || full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + const auto & dev_layer = model.dev_layer(il); + for (auto & backend : backends) { + if (ggml_backend_get_device(backend.get()) == dev_layer) { + if (ggml_backend_supports_op(backend.get(), cur)) { + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get()); + } + } + } + } + } +} + +ggml_tensor * llama_context::build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il) { + return cvec.apply_to(ctx0, cur, il); +} + +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } const float adapter_scale = lora.second; const float scale = lw->get_scale(lora.first->alpha, adapter_scale); @@ -1002,7 +1184,7 @@ ggml_tensor * llama_context::build_rope_factors(int il) { const auto & hparams = model.hparams; // choose long/short freq factors based on the context size - const auto n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + const auto n_ctx_per_seq = n_ctx() / n_seq_max(); if (model.layers[il].rope_freqs != nullptr) { return model.layers[il].rope_freqs; @@ -1141,16 +1323,176 @@ ggml_tensor * llama_context::build_inp_mean( inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); ggml_set_input(inp_mean); - return inp_mean; -} + return inp_mean; +} + +ggml_tensor * llama_context::build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) { + inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp_cls); + + return inp_cls; +} + +ggml_tensor * llama_context::build_attn( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + + //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + const auto & kq_mask = inp_kq_mask_cnv; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + + //const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; + + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch + GGML_UNUSED(worst_case); + const auto n_kv = n_tokens; + + struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, k_cur, 
0, 2, 1, 3)); + //cb(k, "k", il); + + struct ggml_tensor * cur; + + //if (cparams.flash_attn) { + if (false) { // TODO: need to pad the batch size to a multiple of GGML_KQ_MASK_PAD + GGML_UNUSED(model); + GGML_UNUSED(n_ctx); + + struct ggml_tensor * v = ggml_cont(ctx0, ggml_permute(ctx0, v_cur, 0, 2, 1, 3)); + v = ggml_reshape_3d(ctx0, v, n_embd_head_v, n_kv, n_head_kv); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); + + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); + } else { + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx0, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx0, kq); + kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); + + // split cached v into n_head heads + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens))); + + v = ggml_reshape_3d(ctx0, v, n_kv, n_embd_head_v, n_head_kv); + //cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + //cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); + //cb(cur, "kqv_merged_cont", il); + + if (!cparams.offload_kqv) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + ggml_build_forward_expand(gf, cur); + + if (wo) { + cur = build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) { + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch + GGML_UNUSED(causal); + GGML_UNUSED(swa); + GGML_UNUSED(worst_case); + + inp_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_kq_mask, "KQ_mask", -1); + ggml_set_input(inp_kq_mask); + + inp_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_kq_mask, GGML_TYPE_F16) : inp_kq_mask; +} + +// +// perf +// + +llama_perf_context_data llama_context::perf_get_data() const { + llama_perf_context_data data = {}; + + data.t_start_ms = 1e-3 * t_start_us; + data.t_load_ms = 1e-3 * t_load_us; + data.t_p_eval_ms = 1e-3 * t_p_eval_us; + data.t_eval_ms = 1e-3 * t_eval_us; + data.n_p_eval = std::max(1, n_p_eval); + data.n_eval = std::max(1, n_eval); -ggml_tensor * llama_context::build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) { - inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp_cls); + return data; +} - return inp_cls; +void llama_context::perf_reset() { + t_start_us = ggml_time_us(); + t_eval_us = n_eval = 0; + t_p_eval_us = n_p_eval = 0; } // @@ -1620,10 +1962,277 @@ size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_ return io.n_bytes(); } -void llama_context::perf_reset() { - t_start_us = ggml_time_us(); - t_eval_us = n_eval = 0; - t_p_eval_us = n_p_eval = 0; +// +// input +// + +void llama_context::input_set(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + } + + if (ubatch.embd) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); + } + + if (ubatch.pos && inp_pos) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); + } + + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); + + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch.output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch.output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); + + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + + sum[seq_id] += ubatch.n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + 
} + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } + } + + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } + + if (inp_kq_mask) { + // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. + if (cparams.causal_attn) { + // TODO: need to use the batch directly to construct the masks + GGML_ABORT("TODO"); + + //const int64_t n_kv = ubatch.n_tokens; + //const int64_t n_tokens = ubatch.n_tokens; + //const int64_t n_seq_tokens = ubatch.n_seq_tokens; + //const int64_t n_seqs = ubatch.n_seqs; + + //float * data = nullptr; + + //if (inp_kq_mask) { + // GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + // data = (float *) inp_kq_mask->data; + //} + + //// For causal attention, use only the previous KV cells + //// of the correct sequence for each token of the ubatch. + //// It's assumed that if a token in the batch has multiple sequences, they are equivalent. 
+ //for (int h = 0; h < 1; ++h) { + // for (int s = 0; s < n_seqs; ++s) { + // const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // for (int j = 0; j < n_seq_tokens; ++j) { + // const llama_pos pos = ubatch.pos[s*n_seq_tokens + j]; + + // for (int i = 0; i < n_kv; ++i) { + // float f; + // if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + // f = -INFINITY; + // } else { + // if (hparams.use_alibi) { + // f = -std::abs(kv_self.cells[i].pos - pos); + // } else { + // f = 0.0f; + // } + // } + + // if (data) { + // data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + // } + // } + // } + // } + + // if (data) { + // for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + // for (int j = 0; j < n_kv; ++j) { + // data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + // } + // } + // } + //} + } else { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_stride = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + + float * data = (float *) inp_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; + } + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } + } + } + } + } + } + + GGML_ASSERT( + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); } // @@ -1684,11 +2293,6 @@ llama_context_kv_self::llama_context_kv_self( llama_context_kv_self::~llama_context_kv_self() = default; -uint32_t llama_context_kv_self::n_seq_max() const { - // TODO: add notion of n_seq_max to llama_kv_cache and use it here - return kv_self.size; -} - llama_kv_cache * llama_context_kv_self::get_kv_self() { return &kv_self; } @@ -1698,14 +2302,15 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { } ggml_cgraph * llama_context_kv_self::graph_init() { - inp_KQ_mask = nullptr; - inp_KQ_mask_cnv = nullptr; - inp_KQ_mask_swa = nullptr; - inp_KQ_mask_swa_cnv = nullptr; - inp_KQ_mask_cross = nullptr; - inp_k_shift = nullptr; - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_kq_mask_cross = nullptr; + + inp_self_kq_mask = nullptr; + inp_self_kq_mask_cnv = nullptr; + inp_self_kq_mask_swa = nullptr; + inp_self_kq_mask_swa_cnv = nullptr; + inp_self_k_shift = nullptr; return llama_context::graph_init(); } @@ -1979,8 +2584,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { const auto & n_ubatch = cparams.n_ubatch; - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - if (kv_self.recurrent) { if (embd_pooled) { // Pooled embeddings cannot be split across ubatches (yet) @@ -2033,7 +2636,7 @@ int 
llama_context_kv_self::decode(llama_batch & inp_batch) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); + const uint32_t pad = get_ctx_padding(cparams); kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); //kv_self.n = llama_kv_cache_cell_max(kv_self); } @@ -2246,10 +2849,10 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (inp_k_shift) { - assert(ggml_backend_buffer_is_host(inp_k_shift->buffer)); + if (inp_self_k_shift) { + assert(ggml_backend_buffer_is_host(inp_self_k_shift->buffer)); - int32_t * data = (int32_t *) inp_k_shift->data; + int32_t * data = (int32_t *) inp_self_k_shift->data; for (uint32_t i = 0; i < kv_self.size; ++i) { data[i] = kv_self.cells[i].delta; @@ -2262,7 +2865,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context::input_set(ubatch); - if (inp_KQ_mask || inp_KQ_mask_swa) { + if (inp_self_kq_mask || inp_self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; @@ -2273,14 +2876,14 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (inp_KQ_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); - data = (float *) inp_KQ_mask->data; + if (inp_self_kq_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask->buffer)); + data = (float *) inp_self_kq_mask->data; } - if (inp_KQ_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_swa->buffer)); - data_swa = (float *) inp_KQ_mask_swa->data; + if (inp_self_kq_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask_swa->buffer)); + data_swa = (float *) inp_self_kq_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -2341,11 +2944,11 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = hparams.causal_attn && !is_encoding ? 
kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask->buffer)); - float * data = (float *) inp_KQ_mask->data; + float * data = (float *) inp_self_kq_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -2442,14 +3045,14 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); } - if (!is_encoding && inp_KQ_mask_cross) { + if (!is_encoding && inp_kq_mask_cross) { const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_cross->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask_cross->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - float * data = (float *) inp_KQ_mask_cross->data; + float * data = (float *) inp_kq_mask_cross->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -2529,11 +3132,11 @@ void llama_context_kv_self::kv_self_update() { } } -ggml_tensor * llama_context_kv_self::build_inp_k_shift(ggml_context * ctx0) { - inp_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); - ggml_set_input(inp_k_shift); +ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { + inp_self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + ggml_set_input(inp_self_k_shift); - return inp_k_shift; + return inp_self_k_shift; } void llama_context_kv_self::build_attn_inp( @@ -2542,28 +3145,28 @@ void llama_context_kv_self::build_attn_inp( bool causal, bool swa, bool worst_case) { - const auto & hparams = model.hparams; - const auto n_kv = worst_case ? kv_self.size : kv_self.n; - inp_KQ_mask = causal + inp_self_kq_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_KQ_mask, "KQ_mask", -1); - ggml_set_input(inp_KQ_mask); + //cb(inp_self_kq_mask, "KQ_mask", -1); + ggml_set_input(inp_self_kq_mask); - inp_KQ_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask, GGML_TYPE_F16) : inp_KQ_mask; + inp_self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_self_kq_mask, GGML_TYPE_F16) : inp_self_kq_mask; if (swa) { + const auto & hparams = model.hparams; + GGML_ASSERT(hparams.n_swa > 0); - inp_KQ_mask_swa = causal + inp_self_kq_mask_swa = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_KQ_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(inp_KQ_mask_swa); + //cb(inp_self_kq_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp_self_kq_mask_swa); - inp_KQ_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask_swa, GGML_TYPE_F16) : inp_KQ_mask_swa; + inp_self_kq_mask_swa_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_self_kq_mask_swa, GGML_TYPE_F16) : inp_self_kq_mask_swa; } } @@ -2598,7 +3201,7 @@ ggml_tensor * llama_context_kv_self::build_attn( // note: storing RoPE-ed version of K in the KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens); struct ggml_tensor * v_cache_view = nullptr; @@ -2641,7 +3244,7 @@ ggml_tensor * llama_context_kv_self::build_attn( } }; - const auto & kq_mask = is_sliding ? inp_KQ_mask_swa_cnv : inp_KQ_mask_cnv; + const auto & kq_mask = is_sliding ? inp_self_kq_mask_swa_cnv : inp_self_kq_mask_cnv; const auto n_kv = worst_case ? kv_self.size : kv_self.n; @@ -2754,15 +3357,6 @@ ggml_tensor * llama_context_kv_self::build_attn( return cur; } -ggml_tensor * llama_context_kv_self::build_attn_soft_max( - ggml_context * ctx0, - ggml_tensor * kq, - float kq_scale) { - const auto & hparams = model.hparams; - - return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); -} - void llama_context_kv_self::build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) { @@ -2775,7 +3369,7 @@ void llama_context_kv_self::build_kv_self_shift( //GGML_ASSERT(kv_self.size == n_ctx); - ggml_tensor * inp_k_shift = build_inp_k_shift(ctx0); + ggml_tensor * inp_self_k_shift = build_inp_self_k_shift(ctx0); for (uint32_t il = 0; il < n_layer; ++il) { const int64_t n_head_kv = hparams.n_head_kv(il); @@ -2790,7 +3384,7 @@ void llama_context_kv_self::build_kv_self_shift( ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp_k_shift, rope_factors, kv_self.k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self.k_l[il]->buffer); ggml_build_forward_expand(gf, cur); } @@ -3082,7 +3676,7 @@ ggml_tensor * llama_context_kv_self::build_inp_embd_enc( return inp_embd_enc; } -ggml_tensor * llama_context_kv_self::build_inp_KQ_mask_cross( +ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( ggml_context * ctx0, int32_t n_tokens, bool worst_case) { @@ -3092,10 +3686,10 @@ ggml_tensor * llama_context_kv_self::build_inp_KQ_mask_cross( // TODO: not sure if this is correct const int32_t n_outputs_enc = worst_case ? 
n_tokens : embd_enc.size() / n_embd; - inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - ggml_set_input(inp_KQ_mask_cross); + inp_kq_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + ggml_set_input(inp_kq_mask_cross); - return inp_KQ_mask_cross; + return inp_kq_mask_cross; } // @@ -3765,11 +4359,23 @@ int32_t llama_apply_adapter_cvec( // struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { - return llama_kv_cache_view_init(*ctx->get_kv_self(), n_seq_max); + const auto * kv = ctx->get_kv_self(); + if (kv == nullptr) { + LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__); + return {}; + } + + return llama_kv_cache_view_init(*kv, n_seq_max); } void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { - llama_kv_cache_view_update(view, *ctx->get_kv_self()); + const auto * kv = ctx->get_kv_self(); + if (kv == nullptr) { + LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__); + return; + } + + llama_kv_cache_view_update(view, *kv); } // @@ -3903,7 +4509,7 @@ void llama_kv_cache_defrag(llama_context * ctx) { } void llama_kv_self_defrag(llama_context * ctx) { - return llama_kv_cache_defrag(ctx->get_kv_self()); + llama_kv_cache_defrag(ctx->get_kv_self()); } // deprecated diff --git a/src/llama-context.h b/src/llama-context.h index 2b3d5f122bbbe..c605cec6f6a19 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -20,6 +20,7 @@ class llama_io_write_i; using llama_loras = std::unordered_map; +// basic transformer without KV cache struct llama_context : public llama_graph_i { llama_context( const llama_model & model, @@ -38,17 +39,19 @@ struct llama_context : public llama_graph_i { virtual uint32_t n_ctx_per_seq() const; virtual uint32_t n_batch() const; virtual uint32_t n_ubatch() const; - virtual uint32_t n_seq_max() const = 0; + virtual uint32_t n_seq_max() const; virtual uint32_t n_threads() const; virtual uint32_t n_threads_batch() const; virtual int32_t max_nodes() const; - virtual llama_kv_cache * get_kv_self() = 0; - virtual const llama_kv_cache * get_kv_self() const = 0; + // returns nullptr + virtual llama_kv_cache * get_kv_self(); + virtual const llama_kv_cache * get_kv_self() const; - virtual void kv_self_update() = 0; + // noop + virtual void kv_self_update(); virtual enum llama_pooling_type pooling_type() const; @@ -109,8 +112,6 @@ struct llama_context : public llama_graph_i { ggml_cgraph * gf, bool batched); - virtual void input_set(const llama_ubatch & ubatch); - // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. 
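    // note: space is reserved for at least max(n_outputs, n_seq_max()) rows; callers such as
    //       encode()/decode() treat a return value smaller than the requested count as an
    //       allocation failure (they return -2 in that case)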
virtual int32_t output_reserve(int32_t n_outputs); @@ -128,7 +129,7 @@ struct llama_context : public llama_graph_i { // return positive int on warning // return negative int on error // - virtual int encode(llama_batch & inp_batch) = 0; + virtual int encode(llama_batch & inp_batch); // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), @@ -142,7 +143,7 @@ struct llama_context : public llama_graph_i { // return positive int on warning // return negative int on error // - virtual int decode(llama_batch & inp_batch) = 0; + virtual int decode(llama_batch & inp_batch); // // graph build API (generic) @@ -204,6 +205,31 @@ struct llama_context : public llama_graph_i { ggml_context * ctx0, int32_t n_tokens); + virtual void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case); + + virtual ggml_tensor * build_attn( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case); + + // perf + + virtual llama_perf_context_data perf_get_data() const; + virtual void perf_reset(); + // state save/load virtual size_t state_get_size(); @@ -238,13 +264,7 @@ struct llama_context : public llama_graph_i { const llama_token * tokens, size_t n_token_count); - // perf - - virtual llama_perf_context_data perf_get_data() const; - virtual void perf_reset(); - protected: - // state save/load virtual size_t state_get_data(llama_io_write_i & io); @@ -253,14 +273,21 @@ struct llama_context : public llama_graph_i { virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); - // input tensors + // input + + virtual void input_set(const llama_ubatch & ubatch); - struct ggml_tensor * inp_tokens; // I32 [n_batch] - struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - struct ggml_tensor * inp_pos; // I32 [n_batch] - struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - struct ggml_tensor * inp_cls; // I32 [n_batch] + // base input tensors + ggml_tensor * inp_tokens; // I32 [n_batch] + ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + ggml_tensor * inp_pos; // I32 [n_batch] + ggml_tensor * inp_out_ids; // I32 [n_outputs] + ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + ggml_tensor * inp_cls; // I32 [n_batch] + + // KQ mask input tensors + ggml_tensor * inp_kq_mask; // F32 [n_tokens, n_batch] + ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch] // members @@ -337,8 +364,6 @@ class llama_context_kv_self : public llama_context { virtual ~llama_context_kv_self(); - virtual uint32_t n_seq_max() const override; - virtual llama_kv_cache * get_kv_self() override; virtual const llama_kv_cache * get_kv_self() const override; @@ -346,8 +371,6 @@ class llama_context_kv_self : public llama_context { virtual ggml_cgraph * graph_init() override; - virtual void input_set(const llama_ubatch & ubatch) override; - virtual int encode(llama_batch & inp_batch) override; virtual int decode(llama_batch & inp_batch) override; @@ -357,17 +380,7 @@ class llama_context_kv_self : public llama_context { // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - // === KV cache === - - llama_kv_cache kv_self; - - ggml_tensor * inp_KQ_mask; // F32 
[kv_size, n_batch] - ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] - ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] - ggml_tensor * inp_k_shift; // I32 [kv_size] - - virtual ggml_tensor * build_inp_k_shift(ggml_context * ctx0) override; + virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; virtual void build_attn_inp( ggml_context * ctx0, @@ -389,11 +402,6 @@ class llama_context_kv_self : public llama_context { int il, bool worst_case) override; - virtual ggml_tensor * build_attn_soft_max( - ggml_context * ctx0, - ggml_tensor * kq, - float kq_scale) override; - virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) override; @@ -414,14 +422,14 @@ class llama_context_kv_self : public llama_context { struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, bool worst_case) override; - virtual ggml_tensor * build_inp_KQ_mask_cross( + virtual ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, int32_t n_tokens, bool worst_case) override; @@ -432,6 +440,16 @@ class llama_context_kv_self : public llama_context { virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + + virtual void input_set(const llama_ubatch & ubatch) override; + + llama_kv_cache kv_self; + + ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * inp_self_k_shift; // I32 [kv_size] }; // a recurrent transformer (ie.e RWKV, Mamba) @@ -447,8 +465,6 @@ class llama_context_recurrent : public llama_context_kv_self { virtual ggml_cgraph * graph_init() override; - virtual void input_set(const llama_ubatch & ubatch) override; - virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, bool worst_case) override; @@ -506,6 +522,8 @@ class llama_context_recurrent : public llama_context_kv_self { bool worst_case) override; protected: + virtual void input_set(const llama_ubatch & ubatch) override; + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 17605e74cc90b..d9d4e00e98ba0 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2,6 +2,84 @@ #include "llama-impl.h" +ggml_tensor * llama_graph_i::build_attn( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(wo); + GGML_UNUSED(wo_b); + GGML_UNUSED(q_cur); + GGML_UNUSED(k_cur); + GGML_UNUSED(v_cur); + GGML_UNUSED(n_tokens); + GGML_UNUSED(kq_scale); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +void llama_graph_i::build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + + 
LLAMA_LOG_ERROR("%s: not implemented\n", __func__); +} + +void llama_graph_i::build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); +} + +ggml_tensor * llama_graph_i::build_inp_self_k_shift( + ggml_context * ctx0) { + GGML_UNUSED(ctx0); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_i::build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(n_tokens); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_i::build_inp_kq_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(n_tokens); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + ggml_tensor * llama_graph_i::build_inp_s_copy ( ggml_context * ctx0, bool worst_case) { diff --git a/src/llama-graph.h b/src/llama-graph.h index b64e0f5f4fdb0..8d237431e657a 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -99,34 +99,29 @@ class llama_graph_i { int32_t n_tokens, float kq_scale, int il, - bool worst_case) = 0; - - virtual ggml_tensor * build_attn_soft_max( - ggml_context * ctx0, - ggml_tensor * kq, - float kq_scale) = 0; + bool worst_case); virtual void build_kv_self_shift( ggml_context * ctx0, - ggml_cgraph * gf) = 0; + ggml_cgraph * gf); // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache virtual void build_kv_self_defrag( ggml_context * ctx0, - ggml_cgraph * gf) = 0; + ggml_cgraph * gf); - virtual ggml_tensor * build_inp_k_shift( - ggml_context * ctx0) = 0; + virtual ggml_tensor * build_inp_self_k_shift( + ggml_context * ctx0); virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, - bool worst_case) = 0; + bool worst_case); - virtual ggml_tensor * build_inp_KQ_mask_cross( + virtual ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, int32_t n_tokens, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 8a87f91290eed..3aec6495fe02e 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1079,14 +1079,26 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t cell_count) // int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + if (!kv) { + return 0; + } + return kv->n_tokens(); } int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + if (!kv) { + return 0; + } + return kv->used; } void llama_kv_cache_clear(llama_kv_cache * kv) { + if (!kv) { + return; + } + kv->clear(); } @@ -1095,6 +1107,10 @@ bool llama_kv_cache_seq_rm( llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + if (!kv) { + return true; + } + return kv->seq_rm(seq_id, p0, p1); } @@ -1104,10 +1120,18 @@ void llama_kv_cache_seq_cp( llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (!kv) { + return; + } + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); } void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id) { + if (!kv) { + return; + } + kv->seq_keep(seq_id); } @@ -1117,6 +1141,10 @@ void llama_kv_cache_seq_add( llama_pos p0, llama_pos p1, llama_pos delta) { + if (!kv) { + return; + } + kv->seq_add(seq_id, p0, p1, delta); } @@ -1126,18 +1154,34 @@ void llama_kv_cache_seq_div( llama_pos p0, 
llama_pos p1, int d) { + if (!kv) { + return; + } + kv->seq_div(seq_id, p0, p1, d); } llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id) { + if (!kv) { + return 0; + } + return kv->seq_pos_max(seq_id); } void llama_kv_cache_defrag(llama_kv_cache * kv) { + if (!kv) { + return; + } + kv->defrag(); } bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { + if (!kv) { + return false; + } + return kv->can_shift; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index debbacbb6183b..a0a7816da2ebf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3956,8 +3956,8 @@ struct llm_build_context { } // TODO: tmp - struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + struct ggml_tensor * build_inp_kq_mask_cross() { + ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens, worst_case); cb(cur, "KQ_mask_cross", -1); return cur; @@ -5568,7 +5568,6 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); - cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { Qcur = build_norm(Qcur, @@ -5578,7 +5577,6 @@ struct llm_build_context { } Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); - cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { Kcur = build_norm(Kcur, @@ -5586,11 +5584,12 @@ struct llm_build_context { model.layers[il].attn_k_norm_b, LLM_NORM, il); } + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); - cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them cur = build_lora_mm(model.layers[il].wqkv, cur); @@ -5600,10 +5599,6 @@ struct llm_build_context { Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -5617,40 +5612,17 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); - - //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lgf->build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); - cb(kq, "kq_soft_max_ext", il); - - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, 
"kqv_merged_cont", il); - - ggml_build_forward_expand(gf, cur); - - cur = build_lora_mm(model.layers[il].wo, cur); - if (model.layers[il].bo) { - cb(cur, "kqv_wo", il); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } - if (model.layers[il].bo) { - cur = ggml_add(ctx0, cur, model.layers[il].bo); - } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { @@ -9652,7 +9624,7 @@ struct llm_build_context { // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + // struct ggml_tensor * KQ_mask_enc = build_inp_kq_mask(false); // for (int il = 0; il < n_layer; ++il) { // struct ggml_tensor * inpSA = inpL; @@ -9781,8 +9753,8 @@ struct llm_build_context { // struct ggml_tensor * embd_enc = build_inp_embd_enc(); // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + // struct ggml_tensor * KQ_mask_dec = build_inp_kq_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_kq_mask_cross(); // for (int il = 0; il < n_layer; ++il) { // struct ggml_tensor * inpSA = inpL; diff --git a/src/llama.cpp b/src/llama.cpp index 3db1644775fe7..9bacc9e9b4bea 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -328,6 +328,11 @@ struct llama_context * llama_init_from_model( try { // TODO: make static method of llama_context switch (model->arch) { + case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_NOMIC_BERT: + ctx = new llama_context(*model, params); + break; case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_MAMBA: From ad870c49f4bc838ed0408bdc4bc976739019c286 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Feb 2025 19:52:42 +0200 Subject: [PATCH 60/84] context : fix causal input for cache-less case ggml-ci --- src/llama-context.cpp | 91 ++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 53 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6b2a11ad69097..648a669b16e6a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -48,6 +48,7 @@ llama_context::llama_context( // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self if (cparams.n_batch < GGML_KQ_MASK_PAD) { LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); cparams.n_batch = GGML_KQ_MASK_PAD; @@ -2127,60 +2128,44 @@ void llama_context::input_set(const llama_ubatch & ubatch) { } if (inp_kq_mask) { - // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
if (cparams.causal_attn) { - // TODO: need to use the batch directly to construct the masks - GGML_ABORT("TODO"); - - //const int64_t n_kv = ubatch.n_tokens; - //const int64_t n_tokens = ubatch.n_tokens; - //const int64_t n_seq_tokens = ubatch.n_seq_tokens; - //const int64_t n_seqs = ubatch.n_seqs; - - //float * data = nullptr; - - //if (inp_kq_mask) { - // GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); - // data = (float *) inp_kq_mask->data; - //} - - //// For causal attention, use only the previous KV cells - //// of the correct sequence for each token of the ubatch. - //// It's assumed that if a token in the batch has multiple sequences, they are equivalent. - //for (int h = 0; h < 1; ++h) { - // for (int s = 0; s < n_seqs; ++s) { - // const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // for (int j = 0; j < n_seq_tokens; ++j) { - // const llama_pos pos = ubatch.pos[s*n_seq_tokens + j]; - - // for (int i = 0; i < n_kv; ++i) { - // float f; - // if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - // f = -INFINITY; - // } else { - // if (hparams.use_alibi) { - // f = -std::abs(kv_self.cells[i].pos - pos); - // } else { - // f = 0.0f; - // } - // } - - // if (data) { - // data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; - // } - // } - // } - // } - - // if (data) { - // for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - // for (int j = 0; j < n_kv; ++j) { - // data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; - // } - // } - // } - //} + const int64_t n_kv = ubatch.n_tokens; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + float * data = (float *) inp_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id && ubatch.pos[ti] <= ubatch.pos[tj]) { + if (hparams.use_alibi) { + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; + } + } + } + } + } } else { const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; From 08011c2ca12ee95b2041561f69ef0cc0be865dca Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Feb 2025 20:54:18 +0200 Subject: [PATCH 61/84] context : add llama_kv_cache_recurrent prototype ggml-ci --- src/llama-context.cpp | 548 +++++++++++++++++++++++++++++++++++------- src/llama-context.h | 20 +- src/llama-kv-cache.h | 9 +- 3 files changed, 476 insertions(+), 101 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 648a669b16e6a..64728e8b592ef 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -359,17 +359,17 @@ int32_t llama_context::max_nodes() const { } llama_kv_cache * llama_context::get_kv_self() { - LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); + LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); return nullptr; } const llama_kv_cache * llama_context::get_kv_self() const { - LLAMA_LOG_DEBUG("%s: llama_context does 
not have a KV cache\n", __func__); + LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); return nullptr; } void llama_context::kv_self_update() { - LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); + LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); } enum llama_pooling_type llama_context::pooling_type() const { @@ -2246,14 +2246,7 @@ llama_context_kv_self::llama_context_kv_self( ggml_type type_k = params.type_k; ggml_type type_v = params.type_v; - // Mamba only needs a constant number of KV cache cells per sequence - if (llama_model_is_recurrent(&model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states - } + GGML_ASSERT(!llama_model_is_recurrent(&model)); GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); @@ -2286,6 +2279,61 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } +void llama_context_kv_self::kv_self_update() { + auto & kv = kv_self; + + if (kv.has_shift) { + if (!kv.can_shift) { + GGML_ABORT("The current context does not support K-shift"); + } + + // apply K-shift if needed + if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + ggml_backend_sched_reset(sched.get()); + + auto * gf = graph_init(); + + build_kv_self_shift(ctx_compute.get(), gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set({}); + + graph_compute(gf, false); + + need_reserve = true; + } + + { + kv.has_shift = false; + + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; + } + } + } + + // defragment the KV cache if needed + if (kv.do_defrag) { + ggml_backend_sched_reset(sched.get()); + + auto * gf = graph_init(); + + build_kv_self_defrag(ctx_compute.get(), gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + // no input + //input_set({}); + + graph_compute(gf, false); + + kv.do_defrag = false; + + need_reserve = true; + } +} + ggml_cgraph * llama_context_kv_self::graph_init() { inp_embd_enc = nullptr; inp_pos_bucket = nullptr; @@ -2310,7 +2358,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const int32_t n_tokens = batch.n_tokens; @@ -2470,7 +2518,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -2552,7 +2600,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { const bool logits_all = n_outputs_all == n_tokens_all; sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, + /* simple_split */ true, /* logits_all */ logits_all); // reserve output buffer @@ -2569,18 +2617,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { const auto & n_ubatch = cparams.n_ubatch; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = sbatch.split_equal(n_ubatch); - } - } else { - ubatch = sbatch.split_simple(n_ubatch); - } + ubatch = sbatch.split_simple(n_ubatch); // count the outputs in this u_batch { @@ -2617,7 +2654,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { bg.save(slot_info); - if (!kv_self.recurrent) { + { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important @@ -2821,10 +2858,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -llama_pos llama_context_kv_self::pos_max() const { - return kv_self.pos_max(); -} - uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { return kv_self.get_padding(cparams); } @@ -3062,61 +3095,6 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } -void llama_context_kv_self::kv_self_update() { - auto & kv = kv_self; - - if (kv.has_shift) { - if (!kv.can_shift) { - GGML_ABORT("The current context does not support K-shift"); - } - - // apply K-shift if needed - if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - ggml_backend_sched_reset(sched.get()); - - auto * gf = graph_init(); - - build_kv_self_shift(ctx_compute.get(), gf); - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - input_set({}); - - graph_compute(gf, false); - - need_reserve = true; - } - - { - kv.has_shift = false; - - for (uint32_t i = 0; i < kv.size; ++i) { - kv.cells[i].delta = 0; - } - } - } - - // defragment the KV cache if needed - if (kv.do_defrag) { - ggml_backend_sched_reset(sched.get()); - - auto * gf = graph_init(); - - build_kv_self_defrag(ctx_compute.get(), gf); - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - // no input - //input_set({}); - - graph_compute(gf, false); - - kv.do_defrag = false; - - need_reserve = true; - } -} - ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { inp_self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); ggml_set_input(inp_self_k_shift); @@ -3176,7 +3154,9 @@ ggml_tensor * llama_context_kv_self::build_attn( // store to KV cache { - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + GGML_ASSERT(!kv_self.recurrent); + + const auto kv_head = worst_case ? 
kv_self.size - n_tokens : kv_self.head; GGML_ASSERT(kv_self.size == n_ctx); @@ -3684,22 +3664,406 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( llama_context_recurrent::llama_context_recurrent( const llama_model & model, const llama_context_params & params) : - llama_context_kv_self(model, params) { + llama_context(model, params), + kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); + + const auto & hparams = model.hparams; + + LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + + // Mamba only needs a constant number of KV cache cells per sequence + GGML_ASSERT(llama_model_is_recurrent(&model)); + + // Mamba needs at least as many KV cells as there are sequences kept at any time + uint32_t kv_size = std::max((uint32_t) 1, params.n_seq_max); + // it's probably best to keep as much precision as possible for the states + ggml_type type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + ggml_type type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + + if (!hparams.vocab_only) { + if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + throw std::runtime_error("failed to initialize self-attention cache"); + } + + { + const size_t memory_size_k = kv_self.size_k_bytes(); + const size_t memory_size_v = kv_self.size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } + } } llama_context_recurrent::~llama_context_recurrent() = default; +llama_kv_cache * llama_context_recurrent::get_kv_self() { + return &kv_self; +} + +const llama_kv_cache * llama_context_recurrent::get_kv_self() const { + return &kv_self; +} + +void llama_context_recurrent::kv_self_update() { + // noop +} + ggml_cgraph * llama_context_recurrent::graph_init() { - inp_s_copy = nullptr; - inp_s_mask = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; - return llama_context_kv_self::graph_init(); + return llama_context::graph_init(); +} + +int llama_context_recurrent::encode(llama_batch & inp_batch) { + GGML_UNUSED(inp_batch); + + LLAMA_LOG_ERROR("%s: encode() not supported for recurrent models\n", __func__); + return -1; +} + +int llama_context_recurrent::decode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self.pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + + const int64_t n_tokens_all = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + // TODO: remove this stuff + class batch_guard { + public: + batch_guard(llama_kv_cache & kv_self) : kv_slot_restorer(kv_self) { + } + + ~batch_guard() { + if (!is_done) { + kv_slot_restorer.restore(); + } + } + + void done() { + is_done = true; + } + + void save(const llama_kv_cache_slot_info & slot_info) { + kv_slot_restorer.save(slot_info); + } + + private: + bool is_done = false; + + llama_kv_slot_restorer kv_slot_restorer; + }; + + batch_guard bg(kv_self); + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int64_t i = 0; i < n_tokens_all; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); + } + } + } + + GGML_ASSERT(n_tokens_all <= cparams.n_batch); + + GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + n_queued_tokens += n_tokens_all; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + embd_seq.clear(); + + int64_t n_outputs_all = 0; + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + } else if (logits_all || embd_pooled) { + n_outputs_all = n_tokens_all; + } else { + // keep last output only + n_outputs_all = 1; + } + + const bool logits_all = n_outputs_all == n_tokens_all; + + sbatch.from_batch(batch, n_embd, + /* simple_split */ false, + /* logits_all */ logits_all); + + // reserve output buffer + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; + + while (sbatch.n_tokens > 0) { + llama_ubatch ubatch = llama_ubatch(); + + const auto & n_ubatch = cparams.n_ubatch; + + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = sbatch.split_equal(n_ubatch); + } + + // count the outputs in this u_batch + { + int32_t n_outputs_new = 0; + + if (n_outputs_all == n_tokens_all) { + n_outputs_new = ubatch.n_tokens; + } else { + GGML_ASSERT(ubatch.output); + for (uint32_t i = 0; i < ubatch.n_tokens; i++) { + n_outputs_new += (int32_t) (ubatch.output[i] != 0); + } + } + + // needs to happen before the graph is built + n_outputs = n_outputs_new; + } + + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + kv_self_update(); + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } + + const auto slot_info = kv_self.find_slot(ubatch); + if 
(!slot_info) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + return -3; + } + + bg.save(slot_info); + } + + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + + // reserve a worst case graph if needed + if (need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(sched.get()); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + need_reserve = false; + } + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; + auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; + + if (t_embd && res.t_embd_pooled) { + t_embd = res.t_embd_pooled; + } + + // extract logits + if (t_logits && n_outputs > 0) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits + n_outputs_prev*n_vocab; + + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd && n_outputs > 0) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd + n_outputs_prev*n_embd; + + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings (cleared before processing each batch) + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + n_outputs_prev += n_outputs; + } + + // finalize the batch processing + bg.done(); + + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (int64_t i = 0; i < n_outputs_all; ++i) { + int64_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; + + // wait for the computation to finish (automatically done when obtaining the model output) + //synchronize(); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
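As an aside, the output-mapping step a few lines above (inverting sbatch.out_ids into output_ids) can be illustrated in isolation. A minimal sketch, assuming a simplified setup where out_ids holds the batch index of each produced output row; the buffer sizes and values here are made up for illustration and only mirror the member names used above:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // out_ids: for each produced output row, the index of the corresponding token
        // in the submitted batch (rows can come out of batch order after sbatch splitting)
        const std::vector<int64_t> out_ids = {2, 0, 3};

        const int64_t n_batch = 5;                     // tokens in the submitted batch
        std::vector<int32_t> output_ids(n_batch, -1);  // -1: this token produced no output

        bool sorted_output = true;

        for (int64_t i = 0; i < (int64_t) out_ids.size(); ++i) {
            output_ids[out_ids[i]] = (int32_t) i;      // batch index -> row in the logits/embd buffers
            if (out_ids[i] != i) {
                sorted_output = false;
            }
        }

        // a later lookup for "logits of batch token 3" reads row output_ids[3] of the logits buffer
        std::printf("sorted_output = %d, row for token 3 = %d\n", sorted_output ? 1 : 0, output_ids[3]);
    }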
+ ggml_backend_sched_reset(sched.get()); + + return 0; } void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // call base functionality - llama_context_kv_self::input_set(ubatch); + llama_context::input_set(ubatch); GGML_ASSERT(kv_self.recurrent); diff --git a/src/llama-context.h b/src/llama-context.h index c605cec6f6a19..df6acb265d52f 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -374,9 +374,6 @@ class llama_context_kv_self : public llama_context { virtual int encode(llama_batch & inp_batch) override; virtual int decode(llama_batch & inp_batch) override; - // max token position across all sequences in the current context - llama_pos pos_max() const; - // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; @@ -453,9 +450,7 @@ class llama_context_kv_self : public llama_context { }; // a recurrent transformer (ie.e RWKV, Mamba) -// TODO: temporary reuse kv_self, but in the future, implement recurrent-specific context with specific cache -//class llama_context_recurrent : public llama_context { -class llama_context_recurrent : public llama_context_kv_self { +class llama_context_recurrent : public llama_context { public: llama_context_recurrent( const llama_model & model, @@ -463,8 +458,16 @@ class llama_context_recurrent : public llama_context_kv_self { virtual ~llama_context_recurrent(); + virtual llama_kv_cache * get_kv_self() override; + virtual const llama_kv_cache * get_kv_self() const override; + + virtual void kv_self_update() override; + virtual ggml_cgraph * graph_init() override; + virtual int encode(llama_batch & inp_batch) override; + virtual int decode(llama_batch & inp_batch) override; + virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, bool worst_case) override; @@ -524,10 +527,11 @@ class llama_context_recurrent : public llama_context_kv_self { protected: virtual void input_set(const llama_ubatch & ubatch) override; + // TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models? 
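The class split above (llama_context_recurrent deriving directly from llama_context and owning its own cache member) follows a plain virtual-dispatch pattern: the base context reports that it has no KV cache, while each specialized context returns the cache it owns. A minimal sketch with made-up stand-in types (context_base, context_recurrent and kv_cache_stub are illustrative only, not the real API):

    #include <cstdio>

    struct kv_cache_stub {};                      // stand-in for the real cache types

    struct context_base {                         // plays the role of llama_context here
        virtual ~context_base() = default;

        virtual kv_cache_stub * get_kv_self() {
            std::printf("warning: this context has no KV cache\n");
            return nullptr;                       // the generic context owns no cache
        }
    };

    struct context_recurrent : context_base {     // plays the role of llama_context_recurrent
        kv_cache_stub kv_self;                    // the specialized context owns its cache

        kv_cache_stub * get_kv_self() override { return &kv_self; }
    };

    int main() {
        context_recurrent rctx;
        context_base * ctx = &rctx;

        // callers only see the base interface; the override decides whether a cache exists
        std::printf("cache present: %s\n", ctx->get_kv_self() ? "yes" : "no");
    }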
+ llama_kv_cache_recurrent kv_self; + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - - // TODO: add recurrent cache }; // For internal test use diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 049193fd0f176..dda9bfec48846 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -48,7 +48,6 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences -// TODO: add llama_hparams & struct llama_kv_cache { llama_kv_cache(const llama_hparams & hparams); virtual ~llama_kv_cache() = default; @@ -108,7 +107,10 @@ struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; + + // TODO: remove this and implement llama_kv_cache_recurrent instead bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token + bool v_trans = true; // the value tensor is transposed bool can_shift = false; @@ -141,6 +143,11 @@ struct llama_kv_cache { bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; +// TODO: temporary reusing llama_kv_cache -- implement recurrent cache and simplify llama_kv_cache +struct llama_kv_cache_recurrent : public llama_kv_cache { + using llama_kv_cache::llama_kv_cache; +}; + // // kv cache restore // From 2645a7d9a999de249e15ff3dae5eea1866221b57 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 10:28:42 +0200 Subject: [PATCH 62/84] context : add save/load for recurrent context ggml-ci --- src/llama-context.cpp | 42 ++++++++++++++++++++++++++++++++++++++---- src/llama-context.h | 6 ++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 64728e8b592ef..4ce54b0d6f890 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3657,6 +3657,40 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( return inp_kq_mask_cross; } +// state save/load + +size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { + llama_context::state_get_data(io); + + kv_self.state_write(io); + + return io.n_bytes(); +} + +size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { + llama_context::state_set_data(io); + + kv_self.state_read(io); + + return io.n_bytes(); +} + +size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { + llama_context::state_seq_get_data(io, seq_id); + + kv_self.state_write(io, seq_id); + + return io.n_bytes(); +} + +size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { + llama_context::state_seq_set_data(io, seq_id); + + kv_self.state_read(io, seq_id); + + return io.n_bytes(); +} + // // llama_context_recurrent // @@ -4527,7 +4561,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( // state save/load -size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { +size_t llama_context_recurrent::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); kv_self.state_write(io); @@ -4535,7 +4569,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { return io.n_bytes(); } -size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { +size_t llama_context_recurrent::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); kv_self.state_read(io); @@ -4543,7 +4577,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { return io.n_bytes(); } -size_t 
llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { +size_t llama_context_recurrent::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context::state_seq_get_data(io, seq_id); kv_self.state_write(io, seq_id); @@ -4551,7 +4585,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { +size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context::state_seq_set_data(io, seq_id); kv_self.state_read(io, seq_id); diff --git a/src/llama-context.h b/src/llama-context.h index df6acb265d52f..9d8b702208b0b 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -525,6 +525,12 @@ class llama_context_recurrent : public llama_context { bool worst_case) override; protected: + virtual size_t state_get_data(llama_io_write_i & io) override; + virtual size_t state_set_data(llama_io_read_i & io) override; + + virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + virtual void input_set(const llama_ubatch & ubatch) override; // TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models? From 548c230dff1060820b7ef66653896accee3772cc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 12:10:57 +0200 Subject: [PATCH 63/84] graph : remove worst_case from the API ggml-ci --- src/llama-context.cpp | 1902 ++++++++++++++++++++-------------------- src/llama-context.h | 274 +++--- src/llama-graph.cpp | 44 +- src/llama-graph.h | 39 +- src/llama-kv-cache.cpp | 1 + src/llama-model.cpp | 132 ++- src/llama-model.h | 3 +- 7 files changed, 1193 insertions(+), 1202 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4ce54b0d6f890..dc1eb70b85a5e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -17,11 +17,12 @@ llama_context::llama_context( const llama_model & model, const llama_context_params & params) : - model (model), - t_start_us(model.t_start_us), - t_load_us (model.t_load_us) { + model (model) { LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); + t_start_us = model.t_start_us; + t_load_us = model.t_load_us; + const auto & hparams = model.hparams; cparams.n_seq_max = std::max(1u, params.n_seq_max); @@ -186,136 +187,174 @@ void llama_context::init() { return; } - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model.devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; + { + // buffer types used for the compute buffer of each backend + backend_buft.clear(); + backend_ptrs.clear(); + + for (auto & backend : backends) { + auto * buft = ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && 
!model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } - const size_t max_nodes = this->max_nodes(); + const size_t max_nodes = this->max_nodes(); - // buffer used to store the computation graph and the tensor meta data - // TODO: move to base class - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + // buffer used to store the computation graph and the tensor meta data + // TODO: move to base class + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model.n_devices() > 1 && - model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && - cparams.offload_kqv; + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + cparams.offload_kqv; - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { - // ignore CPU backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } } } + + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + } } - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + reserve(); +} + +void llama_context::synchronize() { + ggml_backend_sched_synchronize(sched.get()); + + // FIXME: if multiple single tokens are evaluated without a synchronization, + // the stats will be added to the prompt evaluation stats + // this should only happen when using batch size 1 to 
evaluate a batch + + // add the evaluation to the stats + if (n_queued_tokens == 1) { + if (!cparams.no_perf) { + t_eval_us += ggml_time_us() - t_compute_start_us; + } + n_eval++; + } else if (n_queued_tokens > 1) { + if (!cparams.no_perf) { + t_p_eval_us += ggml_time_us() - t_compute_start_us; + } + n_p_eval += n_queued_tokens; + } - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + // get a more accurate load time, upon first eval + if (n_queued_tokens > 0 && !has_evaluated_once) { + t_load_us = ggml_time_us() - t_start_us; + has_evaluated_once = true; } - // initialize scheduler with the worst-case graph - { - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + n_queued_tokens = 0; + t_compute_start_us = 0; +} - int n_splits_pp = -1; - int n_nodes_pp = -1; +void llama_context::reserve() { + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - int n_splits_tg = -1; - int n_nodes_tg = -1; + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - // reserve pp graph first so that buffers are only allocated once - { - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } + int n_splits_pp = -1; + int n_nodes_pp = -1; - n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_pp = ggml_graph_n_nodes(gf); - } + int n_splits_tg = -1; + int n_nodes_tg = -1; - // reserve with tg graph to get the number of splits and nodes - { - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_tg, true); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } - n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_tg = ggml_graph_n_nodes(gf); - } + // max number of outputs + n_outputs = n_tokens; - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - { - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } + // reserve pp graph first so that buffers are only allocated once + { + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp); + if 
(!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); } - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } + n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + n_nodes_pp = ggml_graph_n_nodes(gf); + } + + // reserve with tg graph to get the number of splits and nodes + { + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_tg); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); } + n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + n_nodes_tg = ggml_graph_n_nodes(gf); + } - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + { + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); } + } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); } } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + } } const llama_model & llama_context::get_model() const { @@ -547,201 +586,141 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -void llama_context::synchronize() { - ggml_backend_sched_synchronize(sched.get()); - - // FIXME: if multiple single tokens are evaluated without a synchronization, - // the stats will be added to the prompt evaluation stats - // this should only happen when using batch size 1 to evaluate a batch - - // add the evaluation to 
the stats - if (n_queued_tokens == 1) { - if (!cparams.no_perf) { - t_eval_us += ggml_time_us() - t_compute_start_us; - } - n_eval++; - } else if (n_queued_tokens > 1) { - if (!cparams.no_perf) { - t_p_eval_us += ggml_time_us() - t_compute_start_us; - } - n_p_eval += n_queued_tokens; +int llama_context::encode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; } - // get a more accurate load time, upon first eval - if (n_queued_tokens > 0 && !has_evaluated_once) { - t_load_us = ggml_time_us() - t_start_us; - has_evaluated_once = true; - } + // temporary allocate memory for the input batch if needed + llama_batch_allocr batch_allocr(inp_batch, 0); - n_queued_tokens = 0; - t_compute_start_us = 0; -} + const llama_batch & batch = batch_allocr.batch; -ggml_cgraph * llama_context::graph_init() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; + const int32_t n_tokens = batch.n_tokens; - inp_kq_mask = nullptr; - inp_kq_mask_cnv = nullptr; + const auto & hparams = model.hparams; - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - ctx_compute.reset(ggml_init(params)); + if (batch.token) { + for (int32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } - return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); -} + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); -llama_graph_result llama_context::graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - bool worst_case) { - return model.build_graph(ctx, gf, this, cparams, ubatch, worst_case); -} + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } -enum ggml_status llama_context::graph_compute( - ggml_cgraph * gf, - bool batched) { - int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; - ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; + n_queued_tokens += n_tokens; - if (backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); - } + const int64_t n_embd = hparams.n_embd; - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + // reserve output buffer + if (output_reserve(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; - return status; -} + for (int32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } -int32_t llama_context::output_reserve(int32_t n_outputs) { - const auto & hparams = model.hparams; - const auto & vocab = model.vocab; + n_outputs = n_tokens; - const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch); - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + ggml_backend_sched_alloc_graph(sched.get(), gf); - logits_size = has_logits ? n_vocab*n_outputs_max : 0; - embd_size = has_embd ? n_embd*n_outputs_max : 0; + input_set(ubatch); - if (output_ids.empty()) { - // init, never resized afterwards - output_ids.resize(n_batch); + const auto compute_status = graph_compute(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; } - const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); + auto * t_embd = res.t_embd_pooled ? 
res.t_embd_pooled : res.t_embd; - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!buf_output || prev_size < new_size) { - if (buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - buf_output = nullptr; - logits = nullptr; - embd = nullptr; - } + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + GGML_ASSERT(embd != nullptr); - float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + // extract token embeddings + float * embd_out = embd; - logits = has_logits ? output_base : nullptr; - embd = has_embd ? output_base + logits_size : nullptr; - - output_size = n_outputs_max; - - // set all ids as invalid (negative) - std::fill(output_ids.begin(), output_ids.end(), -1); - - ggml_backend_buffer_clear(buf_output.get(), 0); - - n_outputs = 0; - - return n_outputs_max; -} - -void llama_context::output_reorder() { - auto & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = model.vocab.n_tokens(); - const uint32_t n_embd = model.hparams.n_embd; + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); - GGML_ASSERT((size_t) n_outputs == out_ids.size()); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - // TODO: is there something more efficient which also minimizes swaps? 
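The selection-sort loop below keeps the logits/embeddings rows in sync with out_ids while performing at most n_outputs - 1 swaps. A standalone sketch of the same mechanism, with a single float standing in for each n_vocab-sized row (the data values are made up):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // row i of the output buffers corresponds to batch token out_ids[i];
        // the goal is to bring the rows into batch order
        std::vector<int64_t> out_ids = {2, 0, 3, 1};
        std::vector<float>   logits  = {0.2f, 0.0f, 0.3f, 0.1f};  // one float stands in for each row

        const int32_t n_outputs = (int32_t) out_ids.size();

        // selection sort: at most n_outputs - 1 swaps, and every swap of out_ids
        // is mirrored on the payload so the two stay in sync
        for (int32_t i = 0; i < n_outputs - 1; ++i) {
            int32_t j_min = i;
            for (int32_t j = i + 1; j < n_outputs; ++j) {
                if (out_ids[j] < out_ids[j_min]) {
                    j_min = j;
                }
            }
            if (j_min == i) {
                continue;
            }
            std::swap(out_ids[i], out_ids[j_min]);
            std::swap(logits[i],  logits[j_min]);  // the real code swaps whole n_vocab/n_embd rows here
        }

        for (int32_t i = 0; i < n_outputs; ++i) {
            std::printf("row %d -> token %lld, logit %.1f\n", i, (long long) out_ids[i], logits[i]);
        }
    }

Selection sort fits here because each swap is expensive (whole rows are moved), and among simple in-place sorts it performs the fewest swaps.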
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); } - } - } - std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - output_ids[out_ids[i]] = i; } - out_ids.clear(); } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; } -int llama_context::encode(llama_batch & inp_batch) { +int llama_context::decode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -752,103 +731,142 @@ int llama_context::encode(llama_batch & inp_batch) { const llama_batch & batch = batch_allocr.batch; - const int32_t n_tokens = batch.n_tokens; - + const auto & vocab = model.vocab; const auto & hparams = model.hparams; + const int32_t n_vocab = vocab.n_tokens(); + + const int64_t n_tokens = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT if (batch.token) { - for (int32_t i = 0; i < n_tokens; ++i) { + for (int64_t i = 0; i < n_tokens; ++i) { if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); } } } - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); + // micro-batching is not possible without KV cache + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context requires n_ubatch >= n_tokens"); if (t_compute_start_us == 0) { t_compute_start_us = ggml_time_us(); } - n_queued_tokens += n_tokens; - const int64_t n_embd = hparams.n_embd; + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* 
logits_all */ true); + embd_seq.clear(); + + int64_t n_outputs_all = 0; + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + } else if (logits_all || embd_pooled) { + n_outputs_all = n_tokens; + } else { + // keep last output only + n_outputs_all = 1; + } + + const bool logits_all = n_outputs_all == n_tokens; + + sbatch.from_batch(batch, n_embd, + /* simple_split */ true, + /* logits_all */ logits_all); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); // reserve output buffer - if (output_reserve(n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; }; - for (int32_t i = 0; i < n_tokens; ++i) { - output_ids[i] = i; - } - - n_outputs = n_tokens; - - GGML_ASSERT(need_reserve == false); + n_outputs = n_outputs_all; ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + auto res = graph_build(ctx_compute.get(), gf, ubatch); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(sched.get(), gf); input_set(ubatch); - const auto compute_status = graph_compute(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; + const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } } - auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; + auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; + + if (t_embd && res.t_embd_pooled) { + t_embd = res.t_embd_pooled; + } + + // extract logits + if (t_logits && n_outputs > 0) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits; + + if (n_outputs) { + GGML_ASSERT(n_outputs <= n_outputs_all); + GGML_ASSERT(n_outputs*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } + } // extract embeddings - if (t_embd) { + if (t_embd && n_outputs > 0) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); switch (cparams.pooling_type) { case LLAMA_POOLING_TYPE_NONE: { - GGML_ASSERT(embd != nullptr); - // extract token embeddings + GGML_ASSERT(embd != nullptr); float * embd_out = embd; - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + if (n_outputs) { + GGML_ASSERT(n_outputs <= n_outputs_all); + GGML_ASSERT(n_outputs*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); + } } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: case LLAMA_POOLING_TYPE_LAST: { - // extract sequence embeddings + // extract sequence embeddings (cleared before processing each batch) auto & embd_seq_out = embd_seq; - embd_seq_out.clear(); - - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - for (int32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; } @@ -858,11 +876,18 @@ int llama_context::encode(llama_batch & inp_batch) { } break; case LLAMA_POOLING_TYPE_RANK: { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { GGML_ABORT("unknown pooling type"); @@ -870,6 +895,28 @@ int llama_context::encode(llama_batch & inp_batch) { } } + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (int64_t i = 0; i < n_outputs_all; ++i) { + int64_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // wait for the computation to finish (automatically done when obtaining the model output) + //synchronize(); + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. 
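For reference, the causal KQ mask that input_set() builds below (and that the earlier hunk builds directly from the ubatch) can be reduced to a small standalone sketch. This is only an approximation of the real layout: it assumes one sequence id per token and n_kv == n_tokens, whereas the actual code walks the ubatch.n_seq_id[s0] lists and uses the flat data[h*(n_kv*n_tokens) + tj*n_kv + ti] indexing:

    #include <cmath>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Row tj may attend to column ti only if ti belongs to the same sequence and
    // pos[ti] <= pos[tj]; with ALiBi the allowed entries hold -|pos[ti] - pos[tj]|,
    // otherwise 0.0f. Disallowed entries are -INFINITY.
    static std::vector<float> build_causal_mask(
            const std::vector<int> & pos,
            const std::vector<int> & seq,   // one sequence id per token (simplification)
            bool use_alibi) {
        const size_t n = pos.size();        // n_kv == n_tokens in this sketch
        std::vector<float> mask(n*n, -std::numeric_limits<float>::infinity());

        for (size_t tj = 0; tj < n; ++tj) {      // query token (row)
            for (size_t ti = 0; ti < n; ++ti) {  // key token (column)
                if (seq[ti] == seq[tj] && pos[ti] <= pos[tj]) {
                    mask[tj*n + ti] = use_alibi ? -std::fabs((float) (pos[ti] - pos[tj])) : 0.0f;
                }
            }
        }

        return mask;
    }

    int main() {
        // two sequences interleaved in one ubatch: tokens 0-1 are seq 0, tokens 2-3 are seq 1
        const std::vector<int> pos = {0, 1, 0, 1};
        const std::vector<int> seq = {0, 0, 1, 1};

        const auto mask = build_causal_mask(pos, seq, /*use_alibi=*/false);

        for (size_t j = 0; j < pos.size(); ++j) {
            for (size_t i = 0; i < pos.size(); ++i) {
                std::printf("%6.1f ", mask[j*pos.size() + i]);
            }
            std::printf("\n");
        }
    }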
ggml_backend_sched_reset(sched.get()); @@ -877,212 +924,438 @@ int llama_context::encode(llama_batch & inp_batch) { return 0; } -int llama_context::decode(llama_batch & inp_batch) { - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, 0); +// +// input +// - const llama_batch & batch = batch_allocr.batch; +void llama_context::input_set(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; - const auto & vocab = model.vocab; - const auto & hparams = model.hparams; + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; - const int32_t n_vocab = vocab.n_tokens(); + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + } - const int64_t n_tokens = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; + if (ubatch.embd) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); + } - if (batch.token) { - for (int64_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - throw std::runtime_error("invalid token"); + if (ubatch.pos && inp_pos) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); + } + + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); + + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch.output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch.output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); } } } - // micro-batching is not possible without KV cache - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context requires n_ubatch >= n_tokens"); + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; - if (t_compute_start_us == 0) { - t_compute_start_us = ggml_time_us(); + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); + + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with 
pooling_type == MEAN"); + + sum[seq_id] += ubatch.n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } } - n_queued_tokens += n_tokens; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; - embd_seq.clear(); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - int64_t n_outputs_all = 0; + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - // count outputs - if (batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens; ++i) { - n_outputs_all += batch.logits[i] != 0; + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } + + if (inp_kq_mask) { + if (cparams.causal_attn) { + const int64_t n_kv = ubatch.n_tokens; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + float * data = (float *) inp_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s 
= 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id && ubatch.pos[ti] <= ubatch.pos[tj]) { + if (hparams.use_alibi) { + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; + } + } + } + } + } + } else { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_stride = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + + float * data = (float *) inp_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; + } + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } + } + } + } + } + } + + GGML_ASSERT( + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); +} + +// +// output +// + +int32_t llama_context::output_reserve(int32_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; } - } else if (logits_all || embd_pooled) { - n_outputs_all = n_tokens; - } else { - // keep last output only - n_outputs_all = 1; } - const bool logits_all = n_outputs_all == n_tokens; - - sbatch.from_batch(batch, n_embd, - /* simple_split */ true, - /* logits_all */ logits_all); - - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); - // reserve output buffer - if (output_reserve(n_outputs_all) < n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; + logits = has_logits ? output_base : nullptr; + embd = has_embd ? output_base + logits_size : nullptr; - n_outputs = n_outputs_all; + output_size = n_outputs_max; - GGML_ASSERT(need_reserve == false); + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + ggml_backend_buffer_clear(buf_output.get(), 0); - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + n_outputs = 0; - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + return n_outputs_max; +} - ggml_backend_sched_alloc_graph(sched.get(), gf); +void llama_context::output_reorder() { + auto & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; - input_set(ubatch); + GGML_ASSERT((size_t) n_outputs == out_ids.size()); - const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; } + out_ids.clear(); } +} - auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; - auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; - - if (t_embd && res.t_embd_pooled) { - t_embd = res.t_embd_pooled; - } +// +// graph +// - // extract logits - if (t_logits && n_outputs > 0) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(logits != nullptr); +ggml_cgraph * llama_context::graph_init() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; - float * logits_out = logits; + inp_kq_mask = nullptr; + inp_kq_mask_cnv = nullptr; - if (n_outputs) { - GGML_ASSERT(n_outputs <= n_outputs_all); - GGML_ASSERT(n_outputs*n_vocab <= (int64_t) logits_size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); - } - } + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; - // extract embeddings - if (t_embd && n_outputs > 0) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); - GGML_ASSERT(backend_embd != nullptr); + ctx_compute.reset(ggml_init(params)); - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(embd != nullptr); - float * embd_out = embd; + return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); +} - if (n_outputs) { - GGML_ASSERT(n_outputs <= n_outputs_all); - GGML_ASSERT(n_outputs*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings (cleared before processing each batch) - auto & embd_seq_out = embd_seq; +llama_graph_result llama_context::graph_build( + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch) { + return model.build_graph(ctx, gf, this, cparams, ubatch); +} - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // extract the rerank score - a single float per sequence - auto & embd_seq_out = embd_seq; +enum ggml_status llama_context::graph_compute( + ggml_cgraph * gf, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(1); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); } - // set output mappings - { - bool sorted_output = true; - - GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); - - for (int64_t i = 0; i < n_outputs_all; ++i) { - int64_t out_id = sbatch.out_ids[i]; - output_ids[out_id] = i; - if (out_id != i) { - sorted_output = false; - } - } - - if (sorted_output) { - sbatch.out_ids.clear(); - } + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - // wait for the computation to finish (automatically done when obtaining the model output) - //synchronize(); + auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); - return 0; + return status; } +// +// graph build API +// + void llama_context::build_cb( ggml_tensor * cur, const char * name, @@ -1307,10 +1580,8 @@ ggml_tensor * llama_context::build_inp_pos( } ggml_tensor * llama_context::build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { - const int32_t n_out_ids = worst_case ? n_tokens : n_outputs; + ggml_context * ctx0) { + const int32_t n_out_ids = n_outputs; inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); ggml_set_input(inp_out_ids); @@ -1336,6 +1607,22 @@ ggml_tensor * llama_context::build_inp_cls( return inp_cls; } +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) { + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch + GGML_UNUSED(causal); + GGML_UNUSED(swa); + + inp_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_kq_mask, "KQ_mask", -1); + ggml_set_input(inp_kq_mask); + + inp_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_kq_mask, GGML_TYPE_F16) : inp_kq_mask; +} + ggml_tensor * llama_context::build_attn( ggml_context * ctx0, ggml_cgraph * gf, @@ -1346,8 +1633,7 @@ ggml_tensor * llama_context::build_attn( ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto & n_ctx = cparams.n_ctx; @@ -1364,7 +1650,6 @@ ggml_tensor * llama_context::build_attn( const auto & n_embd_head_v = hparams.n_embd_head_v; // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - GGML_UNUSED(worst_case); const auto n_kv = n_tokens; struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); @@ -1450,27 +1735,9 @@ ggml_tensor * llama_context::build_attn( if (wo_b) { cur = ggml_add(ctx0, cur, wo_b); - } - - return cur; -} - -void llama_context::build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa, - bool worst_case) { - // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - GGML_UNUSED(causal); - GGML_UNUSED(swa); - GGML_UNUSED(worst_case); - - inp_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_kq_mask, "KQ_mask", -1); - ggml_set_input(inp_kq_mask); + } - inp_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_kq_mask, GGML_TYPE_F16) : inp_kq_mask; + return cur; } // @@ -1497,7 +1764,7 @@ void llama_context::perf_reset() { } // -// state +// state save/load // class llama_io_write_dummy : public llama_io_write_i { @@ -1857,367 +2124,110 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // write logits { - const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); - - io.write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - io.write(logits, logits_size * sizeof(float)); - } - } - - // write embeddings - { - const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); - - io.write(&embd_size, sizeof(embd_size)); - - if (embd_size) { - io.write(embd, embd_size * sizeof(float)); - } - } - - return io.n_bytes(); -} - -size_t llama_context::state_set_data(llama_io_read_i & io) { - // read model info - { - const std::string cur_arch_str = llm_arch_name(model.arch); - - std::string arch_str; - io.read_string(arch_str); - if (cur_arch_str != arch_str) { - throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); - } - // TODO: add more info which needs to be identical but which is not verified otherwise - } - - // read output ids - { - auto n_outputs = this->n_outputs; - io.read_to(&n_outputs, sizeof(n_outputs)); - - if (n_outputs > output_reserve(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } - - std::vector output_pos; - - if (n_outputs) { - output_pos.resize(n_outputs); - io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= n_batch()) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); - } - this->output_ids[id] = i; - } - - this->n_outputs = n_outputs; - } - } - - // read logits - { - uint64_t logits_size; - io.read_to(&logits_size, sizeof(logits_size)); - - if (this->logits_size < logits_size) { - throw std::runtime_error("logits buffer 
too small"); - } - - if (logits_size) { - io.read_to(this->logits, logits_size * sizeof(float)); - } - } - - // read embeddings - { - uint64_t embd_size; - io.read_to(&embd_size, sizeof(embd_size)); - - if (this->embd_size < embd_size) { - throw std::runtime_error("embeddings buffer too small"); - } - - if (embd_size) { - io.read_to(this->embd, embd_size * sizeof(float)); - } - } - - return io.n_bytes(); -} - -size_t llama_context::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - GGML_UNUSED(seq_id); - - return io.n_bytes(); -} - -size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - GGML_UNUSED(seq_id); - - return io.n_bytes(); -} - -// -// input -// - -void llama_context::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); - } - - if (ubatch.embd) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); - } - - if (ubatch.pos && inp_pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - - if (!inp_out_ids) { - LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); - int32_t * data = (int32_t *) inp_out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - - float * data = (float *) inp_mean->data; - memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == 
LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + io.write(&logits_size, sizeof(logits_size)); - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } + if (logits_size) { + io.write(logits, logits_size * sizeof(float)); } + } - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } + // write embeddings + { + const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); + + io.write(&embd_size, sizeof(embd_size)); + + if (embd_size) { + io.write(embd, embd_size * sizeof(float)); } } - if (inp_kq_mask) { - if (cparams.causal_attn) { - const int64_t n_kv = ubatch.n_tokens; - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; + return io.n_bytes(); +} - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); - float * data = (float *) inp_kq_mask->data; +size_t llama_context::state_set_data(llama_io_read_i & io) { + // read model info + { + const std::string cur_arch_str = llm_arch_name(model.arch); - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + std::string arch_str; + io.read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); + } + // TODO: add more info which needs to be identical but which is not verified otherwise + } - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; + // read output ids + { + auto n_outputs = this->n_outputs; + io.read_to(&n_outputs, sizeof(n_outputs)); - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < 
n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; + if (n_outputs > output_reserve(n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id && ubatch.pos[ti] <= ubatch.pos[tj]) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } + std::vector output_pos; - data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; - } - } - } + if (n_outputs) { + output_pos.resize(n_outputs); + io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); } + this->output_ids[id] = i; } - } else { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_stride = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); - float * data = (float *) inp_kq_mask->data; + this->n_outputs = n_outputs; + } + } - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + // read logits + { + uint64_t logits_size; + io.read_to(&logits_size, sizeof(logits_size)); - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; + if (this->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; + if (logits_size) { + io.read_to(this->logits, logits_size * sizeof(float)); + } + } - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } + // read embeddings + { + uint64_t embd_size; + io.read_to(&embd_size, sizeof(embd_size)); - data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; - } - } + if (this->embd_size < embd_size) { + throw std::runtime_error("embeddings buffer too small"); + } - for (int i = n_tokens; i < n_stride; ++i) { - data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; - } - } - } - } + if (embd_size) { + io.read_to(this->embd, embd_size * sizeof(float)); } } - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + return io.n_bytes(); +} + +size_t llama_context::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { + GGML_UNUSED(seq_id); + + return io.n_bytes(); +} + +size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { + GGML_UNUSED(seq_id); + + return io.n_bytes(); } // @@ -2235,7 +2245,7 @@ llama_context_kv_self::llama_context_kv_self( LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self.get_padding(cparams)); LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); @@ -2271,6 +2281,13 @@ llama_context_kv_self::llama_context_kv_self( llama_context_kv_self::~llama_context_kv_self() = default; 
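For reference, the input_set() hunks above fill the KQ mask directly in the host buffer: a key/query pair is attendable only when the two tokens share a sequence id and, in the causal case, the key position does not exceed the query position; with ALiBi the 0 / -INFINITY values are replaced by a negative distance bias. Below is a minimal standalone sketch of that rule, using simplified flat arrays and a single sequence id per token instead of the llama_ubatch layout (illustrative only, not part of the patch; build_kq_mask is an invented name):

    // sketch: mirrors the masking rule used by llama_context::input_set() above
    #include <cmath>
    #include <cstdint>
    #include <vector>

    static std::vector<float> build_kq_mask(
            const std::vector<int32_t> & seq_id, // sequence id per token (assumed: one id per token)
            const std::vector<int32_t> & pos,    // position per token
            bool causal,
            bool use_alibi) {
        const size_t n = seq_id.size();
        std::vector<float> mask(n*n, -INFINITY);

        for (size_t j = 0; j < n; ++j) {     // query token
            for (size_t i = 0; i < n; ++i) { // key token
                const bool same_seq = seq_id[i] == seq_id[j];
                const bool visible  = !causal || pos[i] <= pos[j];
                if (same_seq && visible) {
                    mask[j*n + i] = use_alibi ? -std::abs(float(pos[i] - pos[j])) : 0.0f;
                }
            }
        }

        return mask;
    }

The real code additionally pads the query dimension to GGML_KQ_MASK_PAD and casts the mask to F16 when flash attention is enabled, as seen in build_attn_inp() further down.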
+void llama_context_kv_self::reserve() { + // simulate full KV cache + kv_self.n = kv_self.size; + + llama_context::reserve(); +} + llama_kv_cache * llama_context_kv_self::get_kv_self() { return &kv_self; } @@ -2282,6 +2299,8 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { void llama_context_kv_self::kv_self_update() { auto & kv = kv_self; + bool need_reserve = false; + if (kv.has_shift) { if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); @@ -2332,20 +2351,30 @@ void llama_context_kv_self::kv_self_update() { need_reserve = true; } -} -ggml_cgraph * llama_context_kv_self::graph_init() { - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; - inp_kq_mask_cross = nullptr; + // reserve a worst case graph if needed + if (need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - inp_self_kq_mask = nullptr; - inp_self_kq_mask_cnv = nullptr; - inp_self_kq_mask_swa = nullptr; - inp_self_kq_mask_swa_cnv = nullptr; - inp_self_k_shift = nullptr; + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - return llama_context::graph_init(); + // simulate full KV cache + kv_self.n = kv_self.size; + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(sched.get()); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + } } int llama_context_kv_self::encode(llama_batch & inp_batch) { @@ -2406,14 +2435,11 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { //batch_manager->prepare(ubatch); - // TODO: do reserve - GGML_ASSERT(need_reserve == false); - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + auto res = graph_build(ctx_compute.get(), gf, ubatch); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2658,42 +2684,18 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = get_ctx_padding(cparams); + const uint32_t pad = kv_self.get_padding(cparams); kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); } } //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - // reserve a worst case graph if needed - if (need_reserve) { - LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - 
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - need_reserve = false; - } - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + auto res = graph_build(ctx_compute.get(), gf, ubatch); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -2841,7 +2843,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { // - do not defrag small contexts (i.e. < 2048 tokens) // - count the padding towards the number of used tokens - const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; + const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + kv_self.get_padding(cparams))/float(kv_self.n)) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { @@ -2858,12 +2860,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { - return kv_self.get_padding(cparams); -} - -// llama input - void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; @@ -3095,6 +3091,20 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } +ggml_cgraph * llama_context_kv_self::graph_init() { + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_kq_mask_cross = nullptr; + + inp_self_kq_mask = nullptr; + inp_self_kq_mask_cnv = nullptr; + inp_self_kq_mask_swa = nullptr; + inp_self_kq_mask_swa_cnv = nullptr; + inp_self_k_shift = nullptr; + + return llama_context::graph_init(); +} + ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { inp_self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); ggml_set_input(inp_self_k_shift); @@ -3106,9 +3116,8 @@ void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa, - bool worst_case) { - const auto n_kv = worst_case ? kv_self.size : kv_self.n; + bool swa) { + const auto n_kv = kv_self.n; inp_self_kq_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) @@ -3143,8 +3152,7 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto & n_ctx = cparams.n_ctx; @@ -3156,7 +3164,7 @@ ggml_tensor * llama_context_kv_self::build_attn( { GGML_ASSERT(!kv_self.recurrent); - const auto kv_head = worst_case ? kv_self.size - n_tokens : kv_self.head; + const auto kv_head = kv_self.head; GGML_ASSERT(kv_self.size == n_ctx); @@ -3211,7 +3219,7 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto & kq_mask = is_sliding ? 
inp_self_kq_mask_swa_cnv : inp_self_kq_mask_cnv; - const auto n_kv = worst_case ? kv_self.size : kv_self.n; + const auto n_kv = kv_self.n; const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); @@ -3626,14 +3634,12 @@ void llama_context_kv_self::build_kv_self_defrag( } ggml_tensor * llama_context_kv_self::build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { + ggml_context * ctx0) { const auto & hparams = model.hparams; const int64_t n_embd = hparams.n_embd; // TODO: not sure if this is correct - const int32_t n_outputs_enc = worst_case ? n_tokens : embd_enc.size() / n_embd; + const int32_t n_outputs_enc = embd_enc.size() / n_embd; inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); ggml_set_input(inp_embd_enc); @@ -3643,13 +3649,12 @@ ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { + int32_t n_tokens) { const auto & hparams = model.hparams; const int64_t n_embd = hparams.n_embd; // TODO: not sure if this is correct - const int32_t n_outputs_enc = worst_case ? n_tokens : embd_enc.size() / n_embd; + const int32_t n_outputs_enc = embd_enc.size() / n_embd; inp_kq_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); ggml_set_input(inp_kq_mask_cross); @@ -3738,6 +3743,11 @@ llama_context_recurrent::llama_context_recurrent( llama_context_recurrent::~llama_context_recurrent() = default; +void llama_context_recurrent::reserve() { + // TODO: implement recurrent-specific reserve logic + llama_context::reserve(); +} + llama_kv_cache * llama_context_recurrent::get_kv_self() { return &kv_self; } @@ -3750,13 +3760,6 @@ void llama_context_recurrent::kv_self_update() { // noop } -ggml_cgraph * llama_context_recurrent::graph_init() { - inp_s_copy = nullptr; - inp_s_mask = nullptr; - - return llama_context::graph_init(); -} - int llama_context_recurrent::encode(llama_batch & inp_batch) { GGML_UNUSED(inp_batch); @@ -3917,34 +3920,11 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - // reserve a worst case graph if needed - if (need_reserve) { - LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - need_reserve = false; - } - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + auto res = graph_build(ctx_compute.get(), gf, ubatch); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d 
leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -4147,24 +4127,32 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { } } +ggml_cgraph * llama_context_recurrent::graph_init() { + inp_s_copy = nullptr; + inp_s_mask = nullptr; + + return llama_context::graph_init(); +} + ggml_tensor * llama_context_recurrent::build_inp_s_copy( - ggml_context * ctx0, - bool worst_case) { - const auto n_kv = worst_case ? kv_self.size : kv_self.n; + ggml_context * ctx0) { + const auto n_kv = kv_self.n; inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); //cb(inp_s_copy, "inp_s_copy", -1); ggml_set_input(inp_s_copy); + return inp_s_copy; } ggml_tensor * llama_context_recurrent::build_inp_s_mask( - ggml_context * ctx0, - bool worst_case) { - const auto n_kv = worst_case ? kv_self.size : kv_self.n; + ggml_context * ctx0) { + const auto n_kv = kv_self.n; + inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); //cb(inp_s_mask, "inp_s_mask", -1); ggml_set_input(inp_s_mask); + return inp_s_mask; } @@ -4174,12 +4162,10 @@ ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, - int32_t n_tokens, int32_t n_state, - int32_t n_seqs, - bool worst_case) { - const auto n_kv = worst_case ? kv_self.size : kv_self.n; - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + int32_t n_seqs) { + const auto n_kv = kv_self.n; + const auto kv_head = kv_self.head; struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); @@ -4210,13 +4196,10 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; - const auto & n_tokens = ubatch.n_tokens; - - const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + const auto kv_head = kv_self.head; const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = hparams.ssm_d_inner; @@ -4240,11 +4223,11 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( // (ab)using the KV cache to store the states struct ggml_tensor * conv = build_copy_mask_state( ctx0, gf, conv_states_all, state_copy, state_mask, - n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + hparams.n_embd_k_s(), n_seqs); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); struct ggml_tensor * ssm = build_copy_mask_state( ctx0, gf, ssm_states_all, state_copy, state_mask, - n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + hparams.n_embd_v_s(), n_seqs); ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} @@ -4345,20 +4328,18 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_load( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto token_shift_count = hparams.token_shift_count; - const auto & n_tokens = ubatch.n_tokens; const int64_t n_seqs = ubatch.n_seqs; struct ggml_tensor * token_shift_all = kv_self.k_l[il]; struct ggml_tensor * token_shift = build_copy_mask_state( ctx0, gf, token_shift_all, state_copy, state_mask, - n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + hparams.n_embd_k_s(), n_seqs); token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); @@ -4369,17 +4350,15 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto token_shift_count = hparams.token_shift_count; const auto n_embd = hparams.n_embd; - const auto & n_tokens = ubatch.n_tokens; - const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + const auto kv_head = kv_self.head; return ggml_cpy( ctx0, @@ -4396,8 +4375,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto n_tokens = ubatch.n_tokens; @@ -4407,7 +4385,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( const auto n_head = n_embd / head_size; const auto n_head_kv = hparams.n_head_kv(il); - const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + const auto kv_head = kv_self.head; const auto & layer = model.layers[il]; @@ -4516,7 +4494,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( struct ggml_tensor * wkv_state = build_copy_mask_state( ctx0, gf, kv_self.v_l[il], state_copy, state_mask, - n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + hparams.n_embd_v_s(), n_seqs); struct ggml_tensor * wkv_output; if (is_qrwkv) { diff --git a/src/llama-context.h b/src/llama-context.h index 9d8b702208b0b..d4ab5d509b155 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -22,16 +22,25 @@ using llama_loras = std::unordered_map; // basic transformer without KV cache struct llama_context : public llama_graph_i { +public: llama_context( const llama_model & model, const llama_context_params & params); virtual ~llama_context(); - // init scheduler and compute buffers + // init scheduler and compute buffers, reserve worst-case graphs // call once after the context is constructed virtual void init(); + virtual void synchronize(); + +protected: + // called by init() to reserve the worst-case graphs + // override in child classes + virtual void reserve(); + +public: const llama_model & get_model() const; const llama_cparams & get_cparams() const; @@ -93,33 +102,6 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); - //// - - virtual void synchronize(); - - // zero-out inputs and create ggml_context - virtual ggml_cgraph * graph_init(); - - // TODO: add encode/decode graphs - virtual llama_graph_result graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - bool worst_case); - - // returns the result of ggml_backend_sched_graph_compute_async execution - virtual enum ggml_status graph_compute( - ggml_cgraph * gf, - bool batched); - - // Make sure enough space is available for outputs. - // Returns max number of outputs for which space was reserved. - virtual int32_t output_reserve(int32_t n_outputs); - - // make the outputs have the same order they had in the user-provided batch - // TODO: maybe remove this - virtual void output_reorder(); - // encode a batch of tokens by evaluating the encoder part of the transformer // // - lctx: llama context @@ -145,6 +127,60 @@ struct llama_context : public llama_graph_i { // virtual int decode(llama_batch & inp_batch); +protected: + // + // input + // + + // when the compute graph is built, it creates the input tensors that it needs + // the contents of the input tensors are set by the input_set() function + + virtual void input_set(const llama_ubatch & ubatch); + + // base input tensors + ggml_tensor * inp_tokens; // I32 [n_batch] + ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + ggml_tensor * inp_pos; // I32 [n_batch] + ggml_tensor * inp_out_ids; // I32 [n_outputs] + ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + ggml_tensor * inp_cls; // I32 [n_batch] + + // KQ mask input tensors + ggml_tensor * inp_kq_mask; // F32 [n_tokens, n_batch] + ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch] + + // + // output + // + + // Make sure enough space is available for outputs. + // Returns max number of outputs for which space was reserved. 
+ virtual int32_t output_reserve(int32_t n_outputs); + + // make the outputs have the same order they had in the user-provided batch + // TODO: maybe remove this + virtual void output_reorder(); + + // + // graph + // + + // zero-out inputs and create the ctx_context for the compute graph + virtual ggml_cgraph * graph_init(); + + // TODO: add encode/decode graphs + virtual llama_graph_result graph_build( + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch); + + // returns the result of ggml_backend_sched_graph_compute_async execution + virtual enum ggml_status graph_compute( + ggml_cgraph * gf, + bool batched); + + ggml_context_ptr ctx_compute; + // // graph build API (generic) // @@ -193,9 +229,7 @@ struct llama_context : public llama_graph_i { int32_t n_tokens); virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case); + ggml_context * ctx0); virtual ggml_tensor * build_inp_mean( ggml_context * ctx0, @@ -209,8 +243,7 @@ struct llama_context : public llama_graph_i { ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa, - bool worst_case); + bool swa); virtual ggml_tensor * build_attn( ggml_context * ctx0, @@ -222,15 +255,32 @@ struct llama_context : public llama_graph_i { ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case); + int il); +public: + // // perf + // virtual llama_perf_context_data perf_get_data() const; virtual void perf_reset(); +protected: + mutable int64_t t_start_us = 0; + mutable int64_t t_load_us = 0; + mutable int64_t t_p_eval_us = 0; + mutable int64_t t_eval_us = 0; + + mutable int64_t t_compute_start_us = 0; + mutable int64_t n_queued_tokens = 0; + + mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) + mutable int32_t n_eval = 0; // number of eval calls + +public: + // // state save/load + // virtual size_t state_get_size(); virtual size_t state_get_data( uint8_t * dst, size_t size); @@ -265,31 +315,15 @@ struct llama_context : public llama_graph_i { size_t n_token_count); protected: - // state save/load - virtual size_t state_get_data(llama_io_write_i & io); virtual size_t state_set_data(llama_io_read_i & io); virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); - // input - - virtual void input_set(const llama_ubatch & ubatch); - - // base input tensors - ggml_tensor * inp_tokens; // I32 [n_batch] - ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - ggml_tensor * inp_pos; // I32 [n_batch] - ggml_tensor * inp_out_ids; // I32 [n_outputs] - ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - ggml_tensor * inp_cls; // I32 [n_batch] - - // KQ mask input tensors - ggml_tensor * inp_kq_mask; // F32 [n_tokens, n_batch] - ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch] - + // // members + // const llama_model & model; @@ -311,7 +345,9 @@ struct llama_context : public llama_graph_i { ggml_backend_sched_ptr sched; - ggml_context_ptr ctx_compute; + // buffer types used for the compute buffer of each backend + std::vector backend_ptrs; + std::vector backend_buft; // memory buffers used to evaluate the model std::vector buf_compute_meta; @@ -340,19 +376,7 @@ struct llama_context : public llama_graph_i { std::vector output_ids; // map batch token positions to ids of the logits and embd buffers - bool need_reserve = false; bool has_evaluated_once = false; - - mutable int64_t t_start_us = 0; - mutable int64_t t_load_us 
= 0; - mutable int64_t t_p_eval_us = 0; - mutable int64_t t_eval_us = 0; - - mutable int64_t t_compute_start_us = 0; - mutable int64_t n_queued_tokens = 0; - - mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - mutable int32_t n_eval = 0; // number of eval calls }; // transformer with a self-attention KV cache @@ -364,18 +388,40 @@ class llama_context_kv_self : public llama_context { virtual ~llama_context_kv_self(); +protected: + virtual void reserve() override; + +public: virtual llama_kv_cache * get_kv_self() override; virtual const llama_kv_cache * get_kv_self() const override; virtual void kv_self_update() override; - virtual ggml_cgraph * graph_init() override; - virtual int encode(llama_batch & inp_batch) override; virtual int decode(llama_batch & inp_batch) override; - // certain implementations could require a padding for the context size - uint32_t get_ctx_padding(const llama_cparams & cparams) const; +protected: + // + // input + // + + virtual void input_set(const llama_ubatch & ubatch) override; + + ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * inp_self_k_shift; // I32 [kv_size] + + // + // graph + // + + virtual ggml_cgraph * graph_init() override; + + // + // graph build + // virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; @@ -383,8 +429,7 @@ class llama_context_kv_self : public llama_context { ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa, - bool worst_case) override; + bool swa) override; virtual ggml_tensor * build_attn( ggml_context * ctx0, @@ -396,8 +441,7 @@ class llama_context_kv_self : public llama_context { ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case) override; + int il) override; virtual void build_kv_self_shift( ggml_context * ctx0, @@ -422,31 +466,27 @@ class llama_context_kv_self : public llama_context { struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] virtual ggml_tensor * build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) override; + ggml_context * ctx0) override; virtual ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) override; + int32_t n_tokens) override; + + // + // state save/load + // -protected: virtual size_t state_get_data(llama_io_write_i & io) override; virtual size_t state_set_data(llama_io_read_i & io) override; virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; - virtual void input_set(const llama_ubatch & ubatch) override; + // + // members + // llama_kv_cache kv_self; - - ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch] - ggml_tensor * inp_self_k_shift; // I32 [kv_size] }; // a recurrent transformer (ie.e RWKV, Mamba) @@ -458,23 +498,43 @@ class llama_context_recurrent : public llama_context { virtual ~llama_context_recurrent(); +protected: + virtual void reserve() override; + +public: virtual llama_kv_cache * get_kv_self() override; virtual const llama_kv_cache * get_kv_self() 
const override; virtual void kv_self_update() override; - virtual ggml_cgraph * graph_init() override; - virtual int encode(llama_batch & inp_batch) override; virtual int decode(llama_batch & inp_batch) override; +protected: + // + // input + // + + virtual void input_set(const llama_ubatch & ubatch) override; + + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + + // + // graph + // + + virtual ggml_cgraph * graph_init() override; + + // + // graph build + // + virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0, - bool worst_case) override; + ggml_context * ctx0) override; virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0, - bool worst_case) override; + ggml_context * ctx0) override; virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -482,10 +542,8 @@ class llama_context_recurrent : public llama_context { ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, - int32_t n_tokens, int32_t n_state, - int32_t n_seqs, - bool worst_case) override; + int32_t n_seqs) override; virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -494,8 +552,7 @@ class llama_context_recurrent : public llama_context { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) override; + int il) override; virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -503,15 +560,13 @@ class llama_context_recurrent : public llama_context { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) override; + int il) override; virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il, - bool worst_case) override; + int il) override; virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -521,23 +576,24 @@ class llama_context_recurrent : public llama_context { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) override; + int il) override; + + // + // state save/load + // -protected: virtual size_t state_get_data(llama_io_write_i & io) override; virtual size_t state_set_data(llama_io_read_i & io) override; virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; - virtual void input_set(const llama_ubatch & ubatch) override; + // + // members + // // TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models? 
llama_kv_cache_recurrent kv_self; - - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] }; // For internal test use diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index d9d4e00e98ba0..af556f5bb81f0 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -12,8 +12,7 @@ ggml_tensor * llama_graph_i::build_attn( ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(wo); @@ -24,7 +23,6 @@ ggml_tensor * llama_graph_i::build_attn( GGML_UNUSED(n_tokens); GGML_UNUSED(kq_scale); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; @@ -57,12 +55,8 @@ ggml_tensor * llama_graph_i::build_inp_self_k_shift( } ggml_tensor * llama_graph_i::build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { + ggml_context * ctx0) { GGML_UNUSED(ctx0); - GGML_UNUSED(n_tokens); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; @@ -70,21 +64,17 @@ ggml_tensor * llama_graph_i::build_inp_embd_enc( ggml_tensor * llama_graph_i::build_inp_kq_mask_cross( ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { + int32_t n_tokens) { GGML_UNUSED(ctx0); GGML_UNUSED(n_tokens); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } ggml_tensor * llama_graph_i::build_inp_s_copy ( - ggml_context * ctx0, - bool worst_case) { + ggml_context * ctx0) { GGML_UNUSED(ctx0); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -92,10 +82,8 @@ ggml_tensor * llama_graph_i::build_inp_s_copy ( } ggml_tensor * llama_graph_i::build_inp_s_mask( - ggml_context * ctx0, - bool worst_case) { + ggml_context * ctx0) { GGML_UNUSED(ctx0); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -108,19 +96,15 @@ ggml_tensor * llama_graph_i::build_copy_mask_state( ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, - int32_t n_tokens, int32_t n_state, - int32_t n_seqs, - bool worst_case) { + int32_t n_seqs) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(s); GGML_UNUSED(state_copy); GGML_UNUSED(state_mask); - GGML_UNUSED(n_tokens); GGML_UNUSED(n_state); GGML_UNUSED(n_seqs); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -134,8 +118,7 @@ ggml_tensor * llama_graph_i::build_mamba_layer( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); @@ -143,7 +126,6 @@ ggml_tensor * llama_graph_i::build_mamba_layer( GGML_UNUSED(state_mask); GGML_UNUSED(ubatch); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -156,15 +138,13 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(state_copy); GGML_UNUSED(state_mask); GGML_UNUSED(ubatch); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -175,13 +155,11 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(token_shift); 
GGML_UNUSED(ubatch); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -196,8 +174,7 @@ ggml_tensor * llama_graph_i::build_rwkv6_time_mix( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); @@ -206,7 +183,6 @@ ggml_tensor * llama_graph_i::build_rwkv6_time_mix( GGML_UNUSED(state_mask); GGML_UNUSED(ubatch); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); diff --git a/src/llama-graph.h b/src/llama-graph.h index 8d237431e657a..05349e5872710 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -69,9 +69,7 @@ class llama_graph_i { int32_t n_tokens) = 0; virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) = 0; + ggml_context * ctx0) = 0; virtual ggml_tensor * build_inp_mean( ggml_context * ctx0, @@ -85,8 +83,7 @@ class llama_graph_i { ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa, - bool worst_case) = 0; + bool swa) = 0; virtual ggml_tensor * build_attn( ggml_context * ctx0, @@ -98,8 +95,7 @@ class llama_graph_i { ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case); + int il); virtual void build_kv_self_shift( ggml_context * ctx0, @@ -114,22 +110,17 @@ class llama_graph_i { ggml_context * ctx0); virtual ggml_tensor * build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case); + ggml_context * ctx0); virtual ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, - int32_t n_tokens, - bool worst_case); + int32_t n_tokens); virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0, - bool worst_case); + ggml_context * ctx0); virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0, - bool worst_case); + ggml_context * ctx0); virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -137,10 +128,8 @@ class llama_graph_i { ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, - int32_t n_tokens, int32_t n_state, - int32_t n_seqs, - bool worst_case); + int32_t n_seqs); virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -149,8 +138,7 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case); + int il); virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -158,15 +146,13 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case); + int il); virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il, - bool worst_case); + int il); virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -176,6 +162,5 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case); + int il); }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 3aec6495fe02e..e1b07c9932166 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -610,6 +610,7 @@ struct llama_kv_cache_slot_info llama_kv_cache::find_slot( // sanity check return llama_kv_cache_slot_info(n >= n_seqs); } + // otherwise, one cell per token. 
if (n_tokens > size) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a0a7816da2ebf..8eb99995ea232 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3834,7 +3834,6 @@ struct llm_build_context { const int32_t n_tokens; const int32_t n_ctx_orig; - const bool worst_case; const bool flash_attn; const enum llama_pooling_type pooling_type; @@ -3851,8 +3850,7 @@ struct llm_build_context { llama_graph_i * lgf, const llama_model & model, const llama_cparams & cparams, - const llama_ubatch & ubatch, - bool worst_case) : + const llama_ubatch & ubatch) : model (model), hparams (model.hparams), cparams (cparams), @@ -3879,7 +3877,6 @@ struct llm_build_context { norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), n_ctx_orig (cparams.n_ctx_orig_yarn), - worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -3910,7 +3907,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_out_ids(ctx0); cb(cur, "inp_out_ids", -1); return cur; @@ -3949,7 +3946,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0); cb(cur, "embd_enc", -1); return cur; @@ -3957,7 +3954,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_kq_mask_cross() { - ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens); cb(cur, "KQ_mask_cross", -1); return cur; @@ -4258,7 +4255,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il, worst_case); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il); cb(cur, "kqv_out", il); return cur; @@ -4405,7 +4402,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4566,7 +4563,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4722,7 +4719,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4838,7 +4835,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4943,7 +4940,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5066,7 +5063,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5218,7 +5215,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5340,7 +5337,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5441,7 +5438,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5555,7 +5552,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - lgf->build_attn_inp(ctx0, n_tokens, false, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, false, false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5700,7 +5697,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); inpL = build_norm(inpL, model.tok_norm, @@ -5803,7 +5800,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); if (model.pos_embd) { // inp_pos - contains the positions @@ -5945,7 +5942,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -6096,7 +6093,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor 
* inpSA = inpL; @@ -6210,7 +6207,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6323,7 +6320,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6441,7 +6438,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6588,7 +6585,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6711,7 +6708,7 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6855,7 +6852,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -6961,7 +6958,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7067,7 +7064,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7178,7 +7175,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7297,7 +7294,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7425,7 +7422,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct 
ggml_tensor * inpSA = inpL; @@ -7626,7 +7623,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { // norm @@ -7734,7 +7731,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { // norm @@ -7864,7 +7861,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7977,8 +7974,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); for (int il = 0; il < n_layer; ++il) { // norm @@ -7988,7 +7985,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); - cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -8039,7 +8036,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -8187,7 +8184,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8322,7 +8319,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8442,7 +8439,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8566,7 +8563,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8687,7 +8684,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + 
lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8815,7 +8812,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -8959,7 +8956,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9089,7 +9086,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9252,7 +9249,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9470,7 +9467,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9951,7 +9948,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10045,7 +10042,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10175,7 +10172,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10296,7 +10293,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10414,8 +10411,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ 
-10425,7 +10422,7 @@ struct llm_build_context { const llama_layer * layer = &model.layers[il]; struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load( - ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ctx0, gf, state_copy, state_mask, ubatch, il ); struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); @@ -10441,7 +10438,7 @@ struct llm_build_context { 1 ); - cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -10464,7 +10461,7 @@ struct llm_build_context { ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 ); - ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il)); if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { cur = ggml_scale(ctx0, cur, 0.5F); @@ -10506,8 +10503,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10519,7 +10516,7 @@ struct llm_build_context { const llama_layer * layer = &model.layers[il]; struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load( - ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ctx0, gf, state_copy, state_mask, ubatch, il ); struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); @@ -10532,10 +10529,10 @@ struct llm_build_context { 1 ); - cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); - ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il)); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -10601,7 +10598,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10912,9 +10909,8 @@ llama_graph_result llama_model::build_graph( ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, - const llama_ubatch & ubatch, - bool worst_case) const { - struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case); + const llama_ubatch & ubatch) const { + struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch); switch (arch) 
{ case LLM_ARCH_LLAMA: diff --git a/src/llama-model.h b/src/llama-model.h index 94e7622943937..b2d75e593f2f3 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -374,8 +374,7 @@ struct llama_model { ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, - const llama_ubatch & ubatch, - bool worst_case) const; + const llama_ubatch & ubatch) const; private: struct impl; From ebf1bdf97bed94d46c48b3c3b14f1893fa5bfa5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 14:35:23 +0200 Subject: [PATCH 64/84] context : add logs ggml-ci --- examples/save-load-state/save-load-state.cpp | 2 +- src/llama-context.cpp | 76 ++++++++++++++++++-- src/llama-context.h | 10 +-- 3 files changed, 76 insertions(+), 12 deletions(-) diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 77b1572a9dec5..760ebbbf08788 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -15,7 +15,7 @@ int main(int argc, char ** argv) { return 1; } - print_build_info(); + common_init(); if (params.n_predict < 0) { params.n_predict = 16; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index dc1eb70b85a5e..2a7a4083b547f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -180,6 +180,8 @@ llama_context::llama_context( llama_context::~llama_context() = default; void llama_context::init() { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + const auto & hparams = model.hparams; if (hparams.vocab_only) { @@ -188,13 +190,15 @@ void llama_context::init() { } { - // buffer types used for the compute buffer of each backend + LLAMA_LOG_DEBUG("%s: enumerating backends\n", __func__); + backend_buft.clear(); backend_ptrs.clear(); for (auto & backend : backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { // use the host buffer of the first device CPU for faster transfer of the intermediate state auto * dev = model.devices[0]; @@ -203,14 +207,18 @@ void llama_context::init() { buft = host_buft; } } + backend_buft.push_back(buft); backend_ptrs.push_back(backend.get()); } + LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size()); + const size_t max_nodes = this->max_nodes(); + LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); + // buffer used to store the computation graph and the tensor meta data - // TODO: move to base class buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); // TODO: move these checks to ggml_backend_sched @@ -247,6 +255,8 @@ void llama_context::init() { } } + LLAMA_LOG_DEBUG("%s: calling reserve()\n", __func__); + reserve(); } @@ -286,15 +296,17 @@ void llama_context::reserve() { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + // max number of outputs + n_outputs = n_tokens; + + LLAMA_LOG_DEBUG("%s: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs); + int n_splits_pp = -1; int n_nodes_pp = -1; int n_splits_tg = -1; int n_nodes_tg = -1; - // max number of outputs - n_outputs = n_tokens; - // reserve pp graph first so that buffers are only allocated once { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, 
nullptr, nullptr, nullptr, nullptr}; @@ -521,21 +533,29 @@ int64_t llama_context::n_pos_per_token() const { void llama_context::attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + this->threadpool = threadpool; this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool; } void llama_context::detach_threadpool() { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + this->threadpool = nullptr; this->threadpool_batch = nullptr; } void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { + LLAMA_LOG_DEBUG("%s: n_threads = %d, n_threads_batch = %d\n", __func__, n_threads, n_threads_batch); + cparams.n_threads = n_threads; cparams.n_threads_batch = n_threads_batch; } void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + this->abort_callback = abort_callback; this->abort_callback_data = abort_callback_data; @@ -549,21 +569,29 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void } void llama_context::set_embeddings(bool value) { + LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); + cparams.embeddings = value; } void llama_context::set_causal_attn(bool value) { + LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); + cparams.causal_attn = value; } void llama_context::set_adapter_lora( - struct llama_adapter_lora * adapter, + llama_adapter_lora * adapter, float scale) { + LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale); + loras[adapter] = scale; } bool llama_context::rm_adapter_lora( - struct llama_adapter_lora * adapter) { + llama_adapter_lora * adapter) { + LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter); + auto pos = loras.find(adapter); if (pos != loras.end()) { loras.erase(pos); @@ -574,6 +602,8 @@ bool llama_context::rm_adapter_lora( } void llama_context::clear_adapter_lora() { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + loras.clear(); } @@ -583,6 +613,8 @@ bool llama_context::apply_adapter_cvec( int32_t n_embd, int32_t il_start, int32_t il_end) { + LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end); + return cvec.apply(model, data, len, n_embd, il_start, il_end); } @@ -2085,8 +2117,12 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file } size_t llama_context::state_get_data(llama_io_write_i & io) { + LLAMA_LOG_DEBUG("%s: writing state\n", __func__); + // write model info { + LLAMA_LOG_DEBUG("%s: - writing model info\n", __func__); + const std::string arch_str = llm_arch_name(model.arch); io.write_string(arch_str); // TODO: add more model-specific info which should prevent loading the session file if not identical @@ -2094,6 +2130,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // write output ids { + LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__); + output_reorder(); const auto n_outputs = this->n_outputs; @@ -2124,6 +2162,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // write logits { + LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); io.write(&logits_size, sizeof(logits_size)); @@ -2135,6 +2175,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // write embeddings { + LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__); + const uint64_t embd_size = 
std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); io.write(&embd_size, sizeof(embd_size)); @@ -2148,8 +2190,12 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { } size_t llama_context::state_set_data(llama_io_read_i & io) { + LLAMA_LOG_DEBUG("%s: reading state\n", __func__); + // read model info { + LLAMA_LOG_DEBUG("%s: - reading model info\n", __func__); + const std::string cur_arch_str = llm_arch_name(model.arch); std::string arch_str; @@ -2162,6 +2208,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { // read output ids { + LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__); + auto n_outputs = this->n_outputs; io.read_to(&n_outputs, sizeof(n_outputs)); @@ -2189,6 +2237,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { // read logits { + LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__); + uint64_t logits_size; io.read_to(&logits_size, sizeof(logits_size)); @@ -2203,6 +2253,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { // read embeddings { + LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__); + uint64_t embd_size; io.read_to(&embd_size, sizeof(embd_size)); @@ -2285,6 +2337,8 @@ void llama_context_kv_self::reserve() { // simulate full KV cache kv_self.n = kv_self.size; + LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); + llama_context::reserve(); } @@ -2297,6 +2351,8 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { } void llama_context_kv_self::kv_self_update() { + LLAMA_LOG_DEBUG("%s: kv_self_update()\n", __func__); + auto & kv = kv_self; bool need_reserve = false; @@ -2306,6 +2362,8 @@ void llama_context_kv_self::kv_self_update() { GGML_ABORT("The current context does not support K-shift"); } + LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__); + // apply K-shift if needed if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched.get()); @@ -2334,6 +2392,8 @@ void llama_context_kv_self::kv_self_update() { // defragment the KV cache if needed if (kv.do_defrag) { + LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); + ggml_backend_sched_reset(sched.get()); auto * gf = graph_init(); @@ -3667,6 +3727,7 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); + LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); kv_self.state_write(io); return io.n_bytes(); @@ -3675,6 +3736,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); + LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); kv_self.state_read(io); return io.n_bytes(); diff --git a/src/llama-context.h b/src/llama-context.h index d4ab5d509b155..bc6a0e291edbd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -55,11 +55,13 @@ struct llama_context : public llama_graph_i { virtual int32_t max_nodes() const; - // returns nullptr + // self-attention: + + // if the context does not have a KV cache, return nullptr virtual llama_kv_cache * get_kv_self(); virtual const llama_kv_cache * get_kv_self() const; - // noop + // if the context does not have a KV cache, noop virtual void kv_self_update(); virtual enum llama_pooling_type pooling_type() const; @@ -87,11 +89,11 @@ struct llama_context : public llama_graph_i { virtual void set_causal_attn(bool value); virtual void set_adapter_lora( - struct llama_adapter_lora * 
adapter, + llama_adapter_lora * adapter, float scale); virtual bool rm_adapter_lora( - struct llama_adapter_lora * adapter); + llama_adapter_lora * adapter); virtual void clear_adapter_lora(); From f588a70da3a1177d98e8bc00fe074ab010093709 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 15:08:25 +0200 Subject: [PATCH 65/84] context : wrap input tensors in struct ggml-ci --- src/llama-context.cpp | 196 ++++++++++++++++++++---------------------- src/llama-context.h | 40 +++++---- 2 files changed, 115 insertions(+), 121 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2a7a4083b547f..40d4e47a448bc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -966,32 +966,32 @@ void llama_context::input_set(const llama_ubatch & ubatch) { if (ubatch.token) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + ggml_backend_tensor_set(inp.tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp.tokens)); } if (ubatch.embd) { const int64_t n_embd = hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); + ggml_backend_tensor_set(inp.embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp.embd)); } - if (ubatch.pos && inp_pos) { + if (ubatch.pos && inp.pos) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); + ggml_backend_tensor_set(inp.pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp.pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); + //GGML_ASSERT(inp.out_ids && "every model that can must skip unused outputs"); - if (!inp_out_ids) { - LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); + if (!inp.out_ids) { + LLAMA_LOG_WARN("%s: 'inp.out_ids' is not created\n", __func__); } else { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); - int32_t * data = (int32_t *) inp_out_ids->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp.out_ids->buffer)); + int32_t * data = (int32_t *) inp.out_ids->data; if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { @@ -1020,11 +1020,11 @@ void llama_context::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); + GGML_ASSERT(inp.mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.mean->buffer)); - float * data = (float *) inp_mean->data; - memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); + float * data = (float *) inp.mean->data; + memset(inp.mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp.mean)); std::vector sum(n_tokens, 0); @@ -1061,11 +1061,11 @@ void llama_context::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + GGML_ASSERT(inp.cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + uint32_t * data = (uint32_t *) 
inp.cls->data; + memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; @@ -1088,11 +1088,11 @@ void llama_context::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + GGML_ASSERT(inp.cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + uint32_t * data = (uint32_t *) inp.cls->data; + memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); @@ -1120,15 +1120,15 @@ void llama_context::input_set(const llama_ubatch & ubatch) { } } - if (inp_kq_mask) { + if (inp.kq_mask) { if (cparams.causal_attn) { const int64_t n_kv = ubatch.n_tokens; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); - float * data = (float *) inp_kq_mask->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp.kq_mask->buffer)); + float * data = (float *) inp.kq_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -1165,9 +1165,9 @@ void llama_context::input_set(const llama_ubatch & ubatch) { const int64_t n_seqs = ubatch.n_seqs; const int64_t n_stride = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.kq_mask->buffer)); - float * data = (float *) inp_kq_mask->data; + float * data = (float *) inp.kq_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -1329,15 +1329,7 @@ void llama_context::output_reorder() { // ggml_cgraph * llama_context::graph_init() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; - - inp_kq_mask = nullptr; - inp_kq_mask_cnv = nullptr; + inp = {}; struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta.size(), @@ -1563,11 +1555,11 @@ ggml_tensor * llama_context::build_inp_embd( struct ggml_tensor * inpL; if (ubatch.token) { - inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp_tokens, "inp_tokens", -1); - ggml_set_input(inp_tokens); + inp.tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp.tokens, "inp_tokens", -1); + ggml_set_input(inp.tokens); - inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens); + inpL = ggml_get_rows(ctx0, tok_embd, inp.tokens); // apply lora for embedding tokens if needed for (const auto & lora : loras) { @@ -1581,15 +1573,15 @@ ggml_tensor * llama_context::build_inp_embd( struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, inp_tokens) + ggml_get_rows(ctx0, lw->a, inp.tokens) ), scale); inpL = ggml_add(ctx0, inpL, inpL_delta); } } else { - inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp_embd; - ggml_set_input(inp_embd); + inp.embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = inp.embd; + ggml_set_input(inp.embd); } // For Granite architecture @@ -1605,38 +1597,38 @@ ggml_tensor * llama_context::build_inp_embd( ggml_tensor * 
llama_context::build_inp_pos( ggml_context * ctx0, int32_t n_tokens) { - inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp_pos); + inp.pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp.pos); - return inp_pos; + return inp.pos; } ggml_tensor * llama_context::build_inp_out_ids( ggml_context * ctx0) { const int32_t n_out_ids = n_outputs; - inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); - ggml_set_input(inp_out_ids); + inp.out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); + ggml_set_input(inp.out_ids); - return inp_out_ids; + return inp.out_ids; } ggml_tensor * llama_context::build_inp_mean( ggml_context * ctx0, int32_t n_tokens) { - inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp_mean); + inp.mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp.mean); - return inp_mean; + return inp.mean; } ggml_tensor * llama_context::build_inp_cls( ggml_context * ctx0, int32_t n_tokens) { - inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp_cls); + inp.cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp.cls); - return inp_cls; + return inp.cls; } void llama_context::build_attn_inp( @@ -1648,11 +1640,11 @@ void llama_context::build_attn_inp( GGML_UNUSED(causal); GGML_UNUSED(swa); - inp_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp.kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp_kq_mask, "KQ_mask", -1); - ggml_set_input(inp_kq_mask); + ggml_set_input(inp.kq_mask); - inp_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_kq_mask, GGML_TYPE_F16) : inp_kq_mask; + inp.kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.kq_mask, GGML_TYPE_F16) : inp.kq_mask; } ggml_tensor * llama_context::build_attn( @@ -1673,7 +1665,7 @@ ggml_tensor * llama_context::build_attn( //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - const auto & kq_mask = inp_kq_mask_cnv; + const auto & kq_mask = inp.kq_mask_cnv; const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); @@ -2923,10 +2915,10 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (inp_self_k_shift) { - assert(ggml_backend_buffer_is_host(inp_self_k_shift->buffer)); + if (inp.self_k_shift) { + assert(ggml_backend_buffer_is_host(inp.self_k_shift->buffer)); - int32_t * data = (int32_t *) inp_self_k_shift->data; + int32_t * data = (int32_t *) inp.self_k_shift->data; for (uint32_t i = 0; i < kv_self.size; ++i) { data[i] = kv_self.cells[i].delta; @@ -2939,7 +2931,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context::input_set(ubatch); - if (inp_self_kq_mask || inp_self_kq_mask_swa) { + if (inp.self_kq_mask || inp.self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; @@ -2950,14 +2942,14 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (inp_self_kq_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask->buffer)); - data = (float *) inp_self_kq_mask->data; + if (inp.self_kq_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); + data = (float *) inp.self_kq_mask->data; } - if (inp_self_kq_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask_swa->buffer)); - data_swa = (float *) inp_self_kq_mask_swa->data; + if (inp.self_kq_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask_swa->buffer)); + data_swa = (float *) inp.self_kq_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -3020,9 +3012,9 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // when using kv cache, the mask needs to match the kv cache size const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); - float * data = (float *) inp_self_kq_mask->data; + float * data = (float *) inp.self_kq_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -3156,20 +3148,16 @@ ggml_cgraph * llama_context_kv_self::graph_init() { inp_pos_bucket = nullptr; inp_kq_mask_cross = nullptr; - inp_self_kq_mask = nullptr; - inp_self_kq_mask_cnv = nullptr; - inp_self_kq_mask_swa = nullptr; - inp_self_kq_mask_swa_cnv = nullptr; - inp_self_k_shift = nullptr; + inp = {}; return llama_context::graph_init(); } ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { - inp_self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); - ggml_set_input(inp_self_k_shift); + inp.self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + ggml_set_input(inp.self_k_shift); - return inp_self_k_shift; + return inp.self_k_shift; } void llama_context_kv_self::build_attn_inp( @@ -3179,26 +3167,26 @@ void llama_context_kv_self::build_attn_inp( bool swa) { const auto n_kv = kv_self.n; - inp_self_kq_mask = causal + inp.self_kq_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_self_kq_mask, "KQ_mask", -1); - ggml_set_input(inp_self_kq_mask); + //cb(inp.self_kq_mask, "KQ_mask", -1); + ggml_set_input(inp.self_kq_mask); - inp_self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_self_kq_mask, GGML_TYPE_F16) : inp_self_kq_mask; + inp.self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.self_kq_mask, GGML_TYPE_F16) : inp.self_kq_mask; if (swa) { const auto & hparams = model.hparams; GGML_ASSERT(hparams.n_swa > 0); - inp_self_kq_mask_swa = causal + inp.self_kq_mask_swa = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_self_kq_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(inp_self_kq_mask_swa); + //cb(inp.self_kq_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp.self_kq_mask_swa); - inp_self_kq_mask_swa_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_self_kq_mask_swa, GGML_TYPE_F16) : inp_self_kq_mask_swa; + inp.self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.self_kq_mask_swa, GGML_TYPE_F16) : inp.self_kq_mask_swa; } } @@ -3277,7 +3265,7 @@ ggml_tensor * llama_context_kv_self::build_attn( } }; - const auto & kq_mask = is_sliding ? inp_self_kq_mask_swa_cnv : inp_self_kq_mask_cnv; + const auto & kq_mask = is_sliding ? inp.self_kq_mask_swa_cnv : inp.self_kq_mask_cnv; const auto n_kv = kv_self.n; @@ -4145,9 +4133,9 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { const int64_t n_kv = kv_self.n; - if (inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); - float * data = (float *) inp_s_mask->data; + if (inp.s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_mask->buffer)); + float * data = (float *) inp.s_mask->data; // clear unused states for (int i = 0; i < n_kv; ++i) { @@ -4164,9 +4152,9 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { } } - if (inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); - int32_t * data = (int32_t *) inp_s_copy->data; + if (inp.s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_copy->buffer)); + int32_t * data = (int32_t *) inp.s_copy->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { @@ -4190,8 +4178,8 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { } ggml_cgraph * llama_context_recurrent::graph_init() { - inp_s_copy = nullptr; - inp_s_mask = nullptr; + inp.s_copy = nullptr; + inp.s_mask = nullptr; return llama_context::graph_init(); } @@ -4200,22 +4188,22 @@ ggml_tensor * llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0) { const auto n_kv = kv_self.n; - inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - //cb(inp_s_copy, "inp_s_copy", -1); - ggml_set_input(inp_s_copy); + inp.s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp.s_copy, "inp_s_copy", -1); + ggml_set_input(inp.s_copy); - return inp_s_copy; + return inp.s_copy; } ggml_tensor * llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0) { const auto n_kv = kv_self.n; - inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - //cb(inp_s_mask, "inp_s_mask", -1); - ggml_set_input(inp_s_mask); + inp.s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp.s_mask, "inp_s_mask", -1); + ggml_set_input(inp.s_mask); - return inp_s_mask; + return inp.s_mask; } ggml_tensor * llama_context_recurrent::build_copy_mask_state( diff --git a/src/llama-context.h b/src/llama-context.h index bc6a0e291edbd..ccb84874f8b62 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -139,17 +139,19 @@ struct llama_context : public llama_graph_i { virtual void input_set(const llama_ubatch & ubatch); - // base input tensors - ggml_tensor * inp_tokens; // I32 [n_batch] - ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - ggml_tensor * inp_pos; // I32 [n_batch] - ggml_tensor * inp_out_ids; // I32 [n_outputs] - ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - ggml_tensor * inp_cls; // I32 [n_batch] + struct { + // base input tensors + ggml_tensor * tokens; // I32 [n_batch] + ggml_tensor * embd; // F32 [n_embd, n_batch] + ggml_tensor * pos; // I32 [n_batch] + ggml_tensor * out_ids; // I32 [n_outputs] + ggml_tensor * mean; // F32 [n_batch, n_batch] + ggml_tensor * cls; // I32 [n_batch] - // KQ mask input tensors - ggml_tensor * inp_kq_mask; // F32 [n_tokens, 
n_batch] - ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch] + // KQ mask input tensors + ggml_tensor * kq_mask; // F32 [n_tokens, n_batch] + ggml_tensor * kq_mask_cnv; // [n_tokens, n_batch] + } inp; // // output @@ -409,11 +411,13 @@ class llama_context_kv_self : public llama_context { virtual void input_set(const llama_ubatch & ubatch) override; - ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch] - ggml_tensor * inp_self_k_shift; // I32 [kv_size] + struct { + ggml_tensor * self_kq_mask; // F32 [kv_size, n_batch] + ggml_tensor * self_kq_mask_cnv; // [kv_size, n_batch] + ggml_tensor * self_kq_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * self_kq_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * self_k_shift; // I32 [kv_size] + } inp; // // graph @@ -519,8 +523,10 @@ class llama_context_recurrent : public llama_context { virtual void input_set(const llama_ubatch & ubatch) override; - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + struct { + ggml_tensor * s_copy; // I32 [kv_size] + ggml_tensor * s_mask; // F32 [1, n_kv] + } inp; // // graph From 3753b30d658c93c62f1481d4ed0b2d0800f0d284 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 15:50:27 +0200 Subject: [PATCH 66/84] context : fix n_outputs init ggml-ci --- src/llama-context.cpp | 8 +++----- src/llama-context.h | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 40d4e47a448bc..ce68d410a3795 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1274,14 +1274,13 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { logits = has_logits ? output_base : nullptr; embd = has_embd ? output_base + logits_size : nullptr; - output_size = n_outputs_max; - // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); ggml_backend_buffer_clear(buf_output.get(), 0); - n_outputs = 0; + this->n_outputs = 0; + this->n_outputs_max = n_outputs_max; return n_outputs_max; } @@ -2131,7 +2130,7 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { std::vector w_output_pos; - GGML_ASSERT(n_outputs <= output_size); + GGML_ASSERT(n_outputs <= n_outputs_max); w_output_pos.resize(n_outputs); @@ -2682,7 +2681,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { /* logits_all */ logits_all); // reserve output buffer - // TODO: move to batch manager? 
if (output_reserve(n_outputs_all) < n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; diff --git a/src/llama-context.h b/src/llama-context.h index ccb84874f8b62..f8f01e1bdfe25 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -375,8 +375,8 @@ struct llama_context : public llama_graph_i { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - int32_t output_size = 0; // capacity (of tokens positions) for the output buffers - int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch + int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch + int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers std::vector output_ids; // map batch token positions to ids of the logits and embd buffers From f5e80208c51ea9ec7c3aa0baac0c029278c86c7c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 19:17:47 +0200 Subject: [PATCH 67/84] wip enc-dec --- src/llama-context.cpp | 32 ++++++++++++++++++++++++++------ src/llama-context.h | 26 +++++++++++++++++++++++--- src/llama-graph.cpp | 2 ++ src/llama-graph.h | 15 +++++++++++++++ src/llama-model.h | 2 -- src/llama.cpp | 9 ++++++--- 6 files changed, 72 insertions(+), 14 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ce68d410a3795..9b341aa1824e6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -16,8 +16,10 @@ llama_context::llama_context( const llama_model & model, - const llama_context_params & params) : - model (model) { + const llama_context_params & params, + llama_graph_type gtype) : + llama_graph_i(gtype), + model(model) { LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); t_start_us = model.t_start_us; @@ -2279,8 +2281,9 @@ size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_ llama_context_kv_self::llama_context_kv_self( const llama_model & model, - const llama_context_params & params) : - llama_context(model, params), + const llama_context_params & params, + llama_graph_type gtype) : + llama_context(model, params, gtype), kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_kv_self\n", __func__); @@ -3750,8 +3753,9 @@ size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq llama_context_recurrent::llama_context_recurrent( const llama_model & model, - const llama_context_params & params) : - llama_context(model, params), + const llama_context_params & params, + llama_graph_type gtype) : + llama_context(model, params, gtype), kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); @@ -4619,6 +4623,22 @@ size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_s return io.n_bytes(); } +// +// llama_context_enc_dec +// + +llama_context_enc_dec::llama_context_enc_dec( + const llama_model & model, + const llama_context_params & params) : + llama_context(model, params, LLAMA_GRAPH_TYPE_ENCODER), + ctx_dec(model, params, LLAMA_GRAPH_TYPE_DECODER) { + LLAMA_LOG_INFO("%s: constructing llama_context_enc_dec\n", __func__); +} + +llama_context_enc_dec::~llama_context_enc_dec() { + LLAMA_LOG_INFO("%s: destructing llama_context_enc_dec\n", __func__); +} + // // interface implementation // diff --git a/src/llama-context.h b/src/llama-context.h index f8f01e1bdfe25..7cc982e10bef0 100644 --- a/src/llama-context.h +++ 
b/src/llama-context.h @@ -25,7 +25,8 @@ struct llama_context : public llama_graph_i { public: llama_context( const llama_model & model, - const llama_context_params & params); + const llama_context_params & params, + llama_graph_type gtype); virtual ~llama_context(); @@ -388,7 +389,8 @@ class llama_context_kv_self : public llama_context { public: llama_context_kv_self( const llama_model & model, - const llama_context_params & params); + const llama_context_params & params, + llama_graph_type gtype); virtual ~llama_context_kv_self(); @@ -500,7 +502,8 @@ class llama_context_recurrent : public llama_context { public: llama_context_recurrent( const llama_model & model, - const llama_context_params & params); + const llama_context_params & params, + llama_graph_type gtype); virtual ~llama_context_recurrent(); @@ -604,6 +607,23 @@ class llama_context_recurrent : public llama_context { llama_kv_cache_recurrent kv_self; }; +class llama_context_enc : public llama_context { +public: + using llama_context::llama_context; +}; + +class llama_context_enc_dec : public llama_context { +public: + llama_context_enc_dec( + const llama_model & model, + const llama_context_params & params); + + virtual ~llama_context_enc_dec(); + +protected: + llama_context_kv_self ctx_dec; +}; + // For internal test use // TODO: remove const std::vector> & llama_internal_get_tensor_map(struct llama_context * ctx); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index af556f5bb81f0..af2c94be7f85a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2,6 +2,8 @@ #include "llama-impl.h" +llama_graph_i::llama_graph_i(llama_graph_type type) : type(type) {} + ggml_tensor * llama_graph_i::build_attn( ggml_context * ctx0, ggml_cgraph * gf, diff --git a/src/llama-graph.h b/src/llama-graph.h index 05349e5872710..82d2dc736257a 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -11,6 +11,12 @@ struct ggml_tensor; struct ggml_backend_buffer; struct llama_ubatch; +enum llama_graph_type { + LLAMA_GRAPH_TYPE_DEFAULT, + LLAMA_GRAPH_TYPE_ENCODER, + LLAMA_GRAPH_TYPE_DECODER, +}; + struct llama_graph_result { // important graph nodes ggml_tensor * t_logits = nullptr; @@ -20,6 +26,15 @@ struct llama_graph_result { // TODO: can become more granular in the future class llama_graph_i { +public: + llama_graph_i(llama_graph_type type); + virtual ~llama_graph_i() = default; + + llama_graph_type get_type() const { return type; } + +protected: + llama_graph_type type; + public: // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
virtual void build_cb( diff --git a/src/llama-model.h b/src/llama-model.h index b2d75e593f2f3..447fc0d0576d6 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -5,8 +5,6 @@ #include "llama-hparams.h" #include "llama-vocab.h" -#include "ggml-cpp.h" - #include #include #include diff --git a/src/llama.cpp b/src/llama.cpp index 9bacc9e9b4bea..4ce0c92c4df35 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -331,17 +331,20 @@ struct llama_context * llama_init_from_model( case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: - ctx = new llama_context(*model, params); + ctx = new llama_context_enc(*model, params, LLAMA_GRAPH_TYPE_DEFAULT); + break; + case LLM_ARCH_T5: + ctx = new llama_context_enc_dec(*model, params); break; case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_MAMBA: GGML_ASSERT(llama_model_is_recurrent(model)); - ctx = new llama_context_recurrent(*model, params); + ctx = new llama_context_recurrent(*model, params, LLAMA_GRAPH_TYPE_DEFAULT); break; default: GGML_ASSERT(!llama_model_is_recurrent(model)); - ctx = new llama_context_kv_self(*model, params); + ctx = new llama_context_kv_self(*model, params, LLAMA_GRAPH_TYPE_DEFAULT); }; ctx->init(); From 372fa3a894757cdd844a27141c6396718fce4f4c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Feb 2025 11:38:59 +0200 Subject: [PATCH 68/84] cont : enc should work now, next is dec ggml-ci --- src/llama-context.cpp | 188 +++++++++++++++++++---------- src/llama-context.h | 41 ++++--- src/llama-graph.cpp | 2 + src/llama-graph.h | 5 + src/llama-model.cpp | 274 +++++++++++++++++++++--------------------- 5 files changed, 293 insertions(+), 217 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9b341aa1824e6..d98f4662c2463 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -10,21 +10,64 @@ #include #include +// +// helpers +// + +static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { + // TODO move to hparams if a T5 variant appears that uses a different value + const int64_t max_distance = 128; + + if (bidirectional) { + n_buckets >>= 1; + } + + const int64_t max_exact = n_buckets >> 1; + + int32_t relative_position = x - y; + int32_t relative_bucket = 0; + + if (bidirectional) { + relative_bucket += (relative_position > 0) * n_buckets; + relative_position = abs(relative_position); + } else { + relative_position = -std::min(relative_position, 0); + } + + int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); + relative_position_if_large = std::min(relative_position_if_large, n_buckets - 1); + relative_bucket += (relative_position < max_exact ? 
relative_position : relative_position_if_large); + + return relative_bucket; +} + // // llama_context // llama_context::llama_context( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype) : llama_graph_i(gtype), model(model) { - LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); + LLAMA_LOG_INFO("%s: constructing llama_context, gtype = %d\n", __func__, gtype); t_start_us = model.t_start_us; t_load_us = model.t_load_us; + switch (gtype) { + case LLAMA_GRAPH_TYPE_DEFAULT: + case LLAMA_GRAPH_TYPE_DECODER: + { + } break; + case LLAMA_GRAPH_TYPE_ENCODER: + { + params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; + params.embeddings = true; + } break; + } + const auto & hparams = model.hparams; cparams.n_seq_max = std::max(1u, params.n_seq_max); @@ -45,20 +88,6 @@ llama_context::llama_context( cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; - } - - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : hparams.n_ctx_train; @@ -95,6 +124,20 @@ llama_context::llama_context( cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; } + // with causal attention, the batch size is limited by the context size + cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? 
params.n_batch : params.n_ubatch); + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); @@ -102,6 +145,7 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn); LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); @@ -1207,6 +1251,23 @@ void llama_context::input_set(const llama_ubatch & ubatch) { } } + if (inp.pos_bucket) { + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp.pos_bucket->buffer)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing + + int32_t * data = (int32_t *) inp.pos_bucket->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_tokens; ++i) { + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, true); + } + } + } + } + GGML_ASSERT( // (!a || b) is a logical implication (a -> b) // !hparams.causal_attn -> !cparams.causal_attn @@ -1604,6 +1665,15 @@ ggml_tensor * llama_context::build_inp_pos( return inp.pos; } +ggml_tensor * llama_context::build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) { + inp.pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + ggml_set_input(inp.pos_bucket); + + return inp.pos_bucket; +} + ggml_tensor * llama_context::build_inp_out_ids( ggml_context * ctx0) { const int32_t n_out_ids = n_outputs; @@ -1656,6 +1726,7 @@ ggml_tensor * llama_context::build_attn( ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il) { @@ -1690,6 +1761,8 @@ ggml_tensor * llama_context::build_attn( GGML_UNUSED(model); GGML_UNUSED(n_ctx); + GGML_ASSERT(kq_b == nullptr); + struct ggml_tensor * v = ggml_cont(ctx0, ggml_permute(ctx0, v_cur, 0, 2, 1, 3)); v = ggml_reshape_3d(ctx0, v, n_embd_head_v, n_kv, n_head_kv); @@ -1720,10 +1793,14 @@ ggml_tensor * llama_context::build_attn( if (hparams.attn_soft_cap) { kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx0, kq); + kq = ggml_tanh (ctx0, kq); kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); } + if (kq_b) { + kq = ggml_add(ctx0, kq, kq_b); + } + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); //cb(kq, "kq_soft_max_ext", il); @@ -2281,7 +2358,7 @@ size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_ llama_context_kv_self::llama_context_kv_self( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype) : llama_context(model, params, gtype), kv_self(model.hparams) { @@ -3053,53 +3130,19 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } - if (inp_pos_bucket) { + if (inp.self_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead 
of failing - static const auto relative_position_bucket = [](llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { - // TODO move to hparams if a T5 variant appears that uses a different value - const int64_t max_distance = 128; - - if (bidirectional) { - n_buckets >>= 1; - } + int32_t * data = (int32_t *) inp.self_pos_bucket->data; - const int64_t max_exact = n_buckets >> 1; - - int32_t relative_position = x - y; - int32_t relative_bucket = 0; - if (bidirectional) { - relative_bucket += (relative_position > 0) * n_buckets; - relative_position = abs(relative_position); - } else { - relative_position = -std::min(relative_position, 0); - } - int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); - relative_position_if_large = std::min(relative_position_if_large, n_buckets - 1); - relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large); - return relative_bucket; - }; - - int32_t * data = (int32_t *) inp_pos_bucket->data; - - if (!is_encoding) { - const int64_t n_kv = kv_self.n; - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); - } - } - } - } else { - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); - } + const int64_t n_kv = kv_self.n; + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_kv; ++i) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); } } } @@ -3146,7 +3189,6 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { ggml_cgraph * llama_context_kv_self::graph_init() { inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; inp_kq_mask_cross = nullptr; inp = {}; @@ -3161,6 +3203,17 @@ ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) return inp.self_k_shift; } +ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) { + const auto n_kv = kv_self.n; + + inp.self_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + ggml_set_input(inp.self_pos_bucket); + + return inp.self_pos_bucket; +} + void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, @@ -3199,6 +3252,7 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il) { @@ -3293,6 +3347,8 @@ ggml_tensor * llama_context_kv_self::build_attn( GGML_UNUSED(model); GGML_UNUSED(n_ctx); + GGML_ASSERT(kq_b == nullptr); + // split cached v into n_head heads (not transposed) struct ggml_tensor * v = ggml_view_3d(ctx0, kv_self.v_l[il], @@ -3329,10 +3385,14 @@ ggml_tensor * llama_context_kv_self::build_attn( if (hparams.attn_soft_cap) { kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx0, kq); + kq = ggml_tanh (ctx0, kq); kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); } + if (kq_b) { + kq = ggml_add(ctx0, kq, kq_b); + } + kq = ggml_soft_max_ext(ctx0, kq, 
kq_mask, kq_scale, hparams.f_max_alibi_bias); //cb(kq, "kq_soft_max_ext", il); @@ -3753,7 +3813,7 @@ size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq llama_context_recurrent::llama_context_recurrent( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype) : llama_context(model, params, gtype), kv_self(model.hparams) { @@ -4629,7 +4689,7 @@ size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_s llama_context_enc_dec::llama_context_enc_dec( const llama_model & model, - const llama_context_params & params) : + llama_context_params params) : llama_context(model, params, LLAMA_GRAPH_TYPE_ENCODER), ctx_dec(model, params, LLAMA_GRAPH_TYPE_DECODER) { LLAMA_LOG_INFO("%s: constructing llama_context_enc_dec\n", __func__); diff --git a/src/llama-context.h b/src/llama-context.h index 7cc982e10bef0..3e9baabfb5e67 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -25,7 +25,7 @@ struct llama_context : public llama_graph_i { public: llama_context( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype); virtual ~llama_context(); @@ -142,12 +142,13 @@ struct llama_context : public llama_graph_i { struct { // base input tensors - ggml_tensor * tokens; // I32 [n_batch] - ggml_tensor * embd; // F32 [n_embd, n_batch] - ggml_tensor * pos; // I32 [n_batch] - ggml_tensor * out_ids; // I32 [n_outputs] - ggml_tensor * mean; // F32 [n_batch, n_batch] - ggml_tensor * cls; // I32 [n_batch] + ggml_tensor * tokens; // I32 [n_batch] + ggml_tensor * embd; // F32 [n_embd, n_batch] + ggml_tensor * pos; // I32 [n_batch] + ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] + ggml_tensor * out_ids; // I32 [n_outputs] + ggml_tensor * mean; // F32 [n_batch, n_batch] + ggml_tensor * cls; // I32 [n_batch] // KQ mask input tensors ggml_tensor * kq_mask; // F32 [n_tokens, n_batch] @@ -233,6 +234,10 @@ struct llama_context : public llama_graph_i { ggml_context * ctx0, int32_t n_tokens); + virtual ggml_tensor * build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens); + virtual ggml_tensor * build_inp_out_ids( ggml_context * ctx0); @@ -258,6 +263,7 @@ struct llama_context : public llama_graph_i { ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il); @@ -389,7 +395,7 @@ class llama_context_kv_self : public llama_context { public: llama_context_kv_self( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype); virtual ~llama_context_kv_self(); @@ -414,10 +420,11 @@ class llama_context_kv_self : public llama_context { virtual void input_set(const llama_ubatch & ubatch) override; struct { - ggml_tensor * self_kq_mask; // F32 [kv_size, n_batch] - ggml_tensor * self_kq_mask_cnv; // [kv_size, n_batch] - ggml_tensor * self_kq_mask_swa; // F32 [kv_size, n_batch] - ggml_tensor * self_kq_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] + ggml_tensor * self_kq_mask; // F32 [n_kv, n_batch] + ggml_tensor * self_kq_mask_cnv; // [n_kv, n_batch] + ggml_tensor * self_kq_mask_swa; // F32 [n_kv, n_batch] + ggml_tensor * self_kq_mask_swa_cnv; // [n_kv, n_batch] ggml_tensor * self_k_shift; // I32 [kv_size] } inp; @@ -433,6 +440,10 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; + virtual 
ggml_tensor * build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) override; + virtual void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, @@ -447,6 +458,7 @@ class llama_context_kv_self : public llama_context { ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il) override; @@ -470,7 +482,6 @@ class llama_context_kv_self : public llama_context { std::vector> seq_ids_enc; struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] virtual ggml_tensor * build_inp_embd_enc( @@ -502,7 +513,7 @@ class llama_context_recurrent : public llama_context { public: llama_context_recurrent( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype); virtual ~llama_context_recurrent(); @@ -616,7 +627,7 @@ class llama_context_enc_dec : public llama_context { public: llama_context_enc_dec( const llama_model & model, - const llama_context_params & params); + llama_context_params params); virtual ~llama_context_enc_dec(); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index af2c94be7f85a..3ac96908d69e5 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -12,6 +12,7 @@ ggml_tensor * llama_graph_i::build_attn( ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il) { @@ -22,6 +23,7 @@ ggml_tensor * llama_graph_i::build_attn( GGML_UNUSED(q_cur); GGML_UNUSED(k_cur); GGML_UNUSED(v_cur); + GGML_UNUSED(kq_b); GGML_UNUSED(n_tokens); GGML_UNUSED(kq_scale); GGML_UNUSED(il); diff --git a/src/llama-graph.h b/src/llama-graph.h index 82d2dc736257a..5df90e76d5e3d 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -83,6 +83,10 @@ class llama_graph_i { ggml_context * ctx0, int32_t n_tokens) = 0; + virtual ggml_tensor * build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) = 0; + virtual ggml_tensor * build_inp_out_ids( ggml_context * ctx0) = 0; @@ -108,6 +112,7 @@ class llama_graph_i { ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c862502d3cbac..1e34ed80388bb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1432,7 +1432,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // skip unused tensors if (info.op == GGML_OP_NONE) { - LLAMA_LOG_WARN("model has unused tensor %s -- ignoring\n", tn.str().c_str()); + const size_t nbytes = ggml_nbytes(t_meta); + LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes); + + ml.size_data -= nbytes; ml.n_created++; return nullptr; @@ -3952,6 +3955,14 @@ struct llm_build_context { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } + // TODO: tmp + struct ggml_tensor * build_pos_bucket() { + ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens); + cb(cur, "pos_bucket", -1); + + return cur; + } + // TODO: tmp struct ggml_tensor * build_inp_embd_enc() { ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0); @@ -4263,7 +4274,30 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(ctx0, 
gf, wo, wo_b, q_cur, k_cur, v_cur, nullptr, n_tokens, kq_scale, il); + cb(cur, "kqv_out", il); + + return cur; + } + + struct ggml_tensor * build_attn_with_kq_b( + struct ggml_cgraph * gf, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * kq_b, + int32_t n_tokens, + float kq_scale, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, kq_b, n_tokens, kq_scale, il); cb(cur, "kqv_out", il); return cur; @@ -4364,37 +4398,24 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); } - //struct ggml_tensor * build_pos_bucket(bool causal) { - // if (causal) { - // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - // } else { - // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - // } + struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + struct ggml_tensor * pos_bucket_1d = ggml_reshape_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1]); + cb(pos_bucket_1d, "pos_bucket_1d", -1); - // ggml_set_input(lctx.inp_pos_bucket); - // cb(lctx.inp_pos_bucket, "pos_bucket", -1); + struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + cb(pos_bias, "pos_bias", -1); - // return lctx.inp_pos_bucket; - //} + pos_bias = ggml_reshape_3d(ctx0, pos_bias, pos_bias->ne[0], pos_bucket->ne[0], pos_bucket->ne[1]); + cb(pos_bias, "pos_bias", -1); - //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - // cb(pos_bucket_1d, "pos_bucket_1d", -1); + pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + cb(pos_bias, "pos_bias", -1); - // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - // cb(pos_bias, "pos_bias", -1); + pos_bias = ggml_cont(ctx0, pos_bias); + cb(pos_bias, "pos_bias", -1); - // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_cont(ctx0, pos_bias); - // cb(pos_bias, "pos_bias", -1); - - // return pos_bias; - //} + return pos_bias; + } void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -9614,132 +9635,104 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); } - //void build_t5_enc(ggml_cgraph * gf) { - // const int64_t n_embd_head = hparams.n_embd_head_v; - // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - - // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - - // inpL = build_inp_embd(model.tok_embd); - - // GGML_ASSERT(lctx.is_encoding); - // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - - // // KQ_mask (mask for 1 head, it will be broadcasted to all 
heads) - // struct ggml_tensor * KQ_mask_enc = build_inp_kq_mask(false); - - // for (int il = 0; il < n_layer; ++il) { - // struct ggml_tensor * inpSA = inpL; - - // // norm - // cur = build_norm(inpL, - // model.layers[il].attn_norm_enc, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm", il); + void build_t5_enc(ggml_cgraph * gf) { + const int64_t n_embd_head = hparams.n_embd_head_v; - // // self-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); - // cb(Qcur, "Qcur", il); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); - // cb(Kcur, "Kcur", il); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); - // cb(Vcur, "Vcur", il); + inpL = build_inp_embd(model.tok_embd); - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + lgf->build_attn_inp(ctx0, n_tokens, false, false); - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - // cb(kq_b, "kq_b", il); + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm_enc, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); - // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + cb(Qcur, "Qcur", il); - // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - // cb(v, "v", il); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + cb(Kcur, "Kcur", il); - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - // cb(kqv, "kqv", il); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + cb(Vcur, "Vcur", il); - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b); - // ggml_build_forward_expand(gf, cur); + cur = build_attn_with_kq_b(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); + cb(cur, "kqv_out", il); + } - // cur = build_lora_mm(model.layers[il].wo_enc, cur); - // cb(cur, "kqv_out", il); - // } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } - // if (il == n_layer - 1) { - // // skip computing output for unused tokens - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - // } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - // cb(ffn_inp, "ffn_inp", il); + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm_enc, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - // // feed-forward network - // { - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm_enc, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); + // T5 uses relu, flan-T5 uses gelu-gated + cur = build_ffn(cur, + model.layers[il].ffn_up_enc, NULL, NULL, + model.layers[il].ffn_gate_enc, NULL, NULL, + model.layers[il].ffn_down_enc, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + il); + cb(cur, "ffn_out", il); + } - // // T5 uses relu, flan-T5 uses gelu-gated - // cur = build_ffn(cur, - // model.layers[il].ffn_up_enc, NULL, NULL, - // model.layers[il].ffn_gate_enc, NULL, NULL, - // model.layers[il].ffn_down_enc, NULL, NULL, - // NULL, - // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - // il); - // cb(cur, "ffn_out", il); - // } + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); - // cur = ggml_add(ctx0, cur, ffn_inp); - // cb(cur, "ffn_out", il); + cur = lgf->build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); - // ggml_tensor * layer_dir = cvec.tensor_for(il); - // if (layer_dir != nullptr) { - // cur = ggml_add(ctx0, cur, layer_dir); - // } - // cb(cur, "l_out", il); + // input for next layer + inpL = cur; + } - // // input for next layer - // inpL = cur; - // } + cur = inpL; + cb(cur, "result_embd", -1); - // cur = inpL; - // cb(cur, "result_embd", -1); + cur = build_norm(cur, + model.output_norm_enc, NULL, + LLM_NORM_RMS, -1); - // cur = build_norm(cur, - // model.output_norm_enc, NULL, - // LLM_NORM_RMS, -1); - // - // cb(cur, "result_norm", -1); - // res.t_embd = cur; + cb(cur, "result_norm", -1); + res.t_embd = cur; - // ggml_build_forward_expand(gf, cur); - //} + ggml_build_forward_expand(gf, cur); + } //void build_t5_dec(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; @@ -11091,14 +11084,19 @@ llama_graph_result llama_model::build_graph( { llm.build_bitnet(gf); } break; - //case LLM_ARCH_T5: - // { - // if (lctx.is_encoding) { - // llm.build_t5_enc(gf); - // } else { - // llm.build_t5_dec(gf); - // } - // } break; + case LLM_ARCH_T5: + { + switch (lgf->get_type()) { + case LLAMA_GRAPH_TYPE_ENCODER: + llm.build_t5_enc(gf); + break; + case LLAMA_GRAPH_TYPE_DECODER: + //llm.build_t5_dec(gf); + break; + default: + GGML_ABORT("invalid graph type"); + }; + } break; //case LLM_ARCH_T5ENCODER: // { // llm.build_t5_enc(gf); From 6378112cb5c91125f32bcf35e7f556ee6be40fb9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Feb 2025 19:39:22 +0200 Subject: [PATCH 69/84] graph : remove the build_kv_... 
API from llama_graph_i ggml-ci --- src/llama-context.cpp | 19 +++++++++++++++++ src/llama-context.h | 47 ++++++++++++++++++++++++++++--------------- src/llama-graph.cpp | 18 ----------------- src/llama-graph.h | 9 --------- 4 files changed, 50 insertions(+), 43 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d98f4662c2463..5ad1e2a61edbb 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1842,6 +1842,25 @@ ggml_tensor * llama_context::build_attn( return cur; } +void llama_context::build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); +} + +void llama_context::build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); +} + + // // perf // diff --git a/src/llama-context.h b/src/llama-context.h index 3e9baabfb5e67..09c8f484251c6 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -171,7 +171,7 @@ struct llama_context : public llama_graph_i { // graph // - // zero-out inputs and create the ctx_context for the compute graph + // zero-out inputs and create the ctx_compute for the compute graph virtual ggml_cgraph * graph_init(); // TODO: add encode/decode graphs @@ -187,73 +187,74 @@ struct llama_context : public llama_graph_i { ggml_context_ptr ctx_compute; +public: // - // graph build API (generic) + // graph build // virtual void build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il); + int il) override; // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il); + int il) override; // do mat_mul, while optionally apply lora virtual ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur); + ggml_tensor * cur) override; // do mat_mul_id, while optionally apply lora virtual ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids); + ggml_tensor * ids) override; - virtual ggml_tensor * build_rope_factors(int il); + virtual ggml_tensor * build_rope_factors(int il) override; virtual ggml_tensor * build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuf); + ggml_backend_buffer * bbuf) override; virtual ggml_tensor * build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, - const llama_ubatch & ubatch); + const llama_ubatch & ubatch) override; virtual ggml_tensor * build_inp_pos( ggml_context * ctx0, - int32_t n_tokens); + int32_t n_tokens) override; virtual ggml_tensor * build_inp_pos_bucket( ggml_context * ctx0, - int32_t n_tokens); + int32_t n_tokens) override; virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0); + ggml_context * ctx0) override; virtual ggml_tensor * build_inp_mean( ggml_context * ctx0, - int32_t n_tokens); + int32_t n_tokens) override; virtual ggml_tensor * build_inp_cls( ggml_context * ctx0, - int32_t n_tokens); + int32_t n_tokens) override; virtual void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa); + bool swa) override; virtual ggml_tensor * build_attn( ggml_context * ctx0, @@ -266,7 +267,17 @@ struct llama_context : public llama_graph_i { ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, - int il); + int il) override; + +protected: + virtual void 
build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf); + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf); public: // @@ -434,6 +445,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_cgraph * graph_init() override; +public: // // graph build // @@ -463,6 +475,7 @@ class llama_context_kv_self : public llama_context { float kq_scale, int il) override; +protected: virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) override; @@ -548,6 +561,7 @@ class llama_context_recurrent : public llama_context { virtual ggml_cgraph * graph_init() override; +public: // // graph build // @@ -600,6 +614,7 @@ class llama_context_recurrent : public llama_context { const llama_ubatch & ubatch, int il) override; +protected: // // state save/load // diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 3ac96908d69e5..25922260d2a7c 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -32,24 +32,6 @@ ggml_tensor * llama_graph_i::build_attn( return nullptr; } -void llama_graph_i::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf) { - GGML_UNUSED(ctx0); - GGML_UNUSED(gf); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); -} - -void llama_graph_i::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf) { - GGML_UNUSED(ctx0); - GGML_UNUSED(gf); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); -} - ggml_tensor * llama_graph_i::build_inp_self_k_shift( ggml_context * ctx0) { GGML_UNUSED(ctx0); diff --git a/src/llama-graph.h b/src/llama-graph.h index 5df90e76d5e3d..3433caf63ac89 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -117,15 +117,6 @@ class llama_graph_i { float kq_scale, int il); - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf); - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf); - virtual ggml_tensor * build_inp_self_k_shift( ggml_context * ctx0); From 0699a44c83b5349e13c0e4abe0b3ab09e1d6462c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Feb 2025 20:02:11 +0200 Subject: [PATCH 70/84] context : remove redundant virtual, protected -> private ggml-ci --- src/llama-context.cpp | 8 +++ src/llama-context.h | 126 ++++++++++++++++++++++++------------------ src/llama-graph.cpp | 8 --- src/llama-graph.h | 3 - 4 files changed, 79 insertions(+), 66 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5ad1e2a61edbb..7628cbc9bf20c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1842,6 +1842,14 @@ ggml_tensor * llama_context::build_attn( return cur; } +ggml_tensor * llama_context::build_inp_self_k_shift( + ggml_context * ctx0) { + GGML_UNUSED(ctx0); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + void llama_context::build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) { diff --git a/src/llama-context.h b/src/llama-context.h index 09c8f484251c6..0e55aae1c8dfb 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -140,6 +140,7 @@ struct llama_context : public llama_graph_i { virtual void input_set(const llama_ubatch & ubatch); +private: struct { // base input tensors ggml_tensor * tokens; // I32 [n_batch] @@ -155,6 +156,7 @@ struct llama_context : public llama_graph_i { ggml_tensor * kq_mask_cnv; // [n_tokens, 
n_batch] } inp; +protected: // // output // @@ -192,71 +194,71 @@ struct llama_context : public llama_graph_i { // graph build // - virtual void build_cb( + void build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, int il) override; // apply control vector for layer il - virtual ggml_tensor * build_cvec( + ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, int il) override; // do mat_mul, while optionally apply lora - virtual ggml_tensor * build_lora_mm( + ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur) override; // do mat_mul_id, while optionally apply lora - virtual ggml_tensor * build_lora_mm_id( + ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * ids) override; - virtual ggml_tensor * build_rope_factors(int il) override; + ggml_tensor * build_rope_factors(int il) override; - virtual ggml_tensor * build_rope_shift( + ggml_tensor * build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, ggml_backend_buffer * bbuf) override; - virtual ggml_tensor * build_inp_embd( + ggml_tensor * build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) override; - virtual ggml_tensor * build_inp_pos( + ggml_tensor * build_inp_pos( ggml_context * ctx0, int32_t n_tokens) override; - virtual ggml_tensor * build_inp_pos_bucket( + ggml_tensor * build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) override; - virtual ggml_tensor * build_inp_out_ids( + ggml_tensor * build_inp_out_ids( ggml_context * ctx0) override; - virtual ggml_tensor * build_inp_mean( + ggml_tensor * build_inp_mean( ggml_context * ctx0, int32_t n_tokens) override; - virtual ggml_tensor * build_inp_cls( + ggml_tensor * build_inp_cls( ggml_context * ctx0, int32_t n_tokens) override; - virtual void build_attn_inp( + void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa) override; - virtual ggml_tensor * build_attn( + ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * wo, @@ -270,6 +272,9 @@ struct llama_context : public llama_graph_i { int il) override; protected: + virtual ggml_tensor * build_inp_self_k_shift( + ggml_context * ctx0); + virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf); @@ -288,6 +293,7 @@ struct llama_context : public llama_graph_i { virtual void perf_reset(); protected: + // TODO: become private mutable int64_t t_start_us = 0; mutable int64_t t_load_us = 0; mutable int64_t t_p_eval_us = 0; @@ -346,6 +352,7 @@ struct llama_context : public llama_graph_i { // // members // + // TODO: become private / move to llama_graph_i const llama_model & model; @@ -412,24 +419,25 @@ class llama_context_kv_self : public llama_context { virtual ~llama_context_kv_self(); protected: - virtual void reserve() override; + void reserve() override; public: - virtual llama_kv_cache * get_kv_self() override; - virtual const llama_kv_cache * get_kv_self() const override; + llama_kv_cache * get_kv_self() override; + const llama_kv_cache * get_kv_self() const override; - virtual void kv_self_update() override; + void kv_self_update() override; - virtual int encode(llama_batch & inp_batch) override; - virtual int decode(llama_batch & inp_batch) override; + int encode(llama_batch & inp_batch) override; + int decode(llama_batch & inp_batch) override; protected: // // input // - virtual void input_set(const 
llama_ubatch & ubatch) override; + void input_set(const llama_ubatch & ubatch) override; +private: struct { ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] ggml_tensor * self_kq_mask; // F32 [n_kv, n_batch] @@ -443,26 +451,24 @@ class llama_context_kv_self : public llama_context { // graph // - virtual ggml_cgraph * graph_init() override; + ggml_cgraph * graph_init() override; public: // // graph build // - virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; - - virtual ggml_tensor * build_inp_pos_bucket( + ggml_tensor * build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) override; - virtual void build_attn_inp( + void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa) override; - virtual ggml_tensor * build_attn( + ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * wo, @@ -476,16 +482,22 @@ class llama_context_kv_self : public llama_context { int il) override; protected: - virtual void build_kv_self_shift( + ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; + + void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) override; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( + void build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * gf) override; + // ======================================================= // === encoder-decoder === + // + // TODO: this is temporary here, it will be moved + // // whether we are computing encoder output or decoder output bool is_encoding = false; @@ -497,23 +509,25 @@ class llama_context_kv_self : public llama_context { struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] - virtual ggml_tensor * build_inp_embd_enc( + ggml_tensor * build_inp_embd_enc( ggml_context * ctx0) override; - virtual ggml_tensor * build_inp_kq_mask_cross( + ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, int32_t n_tokens) override; + // ====================================================== // // state save/load // - virtual size_t state_get_data(llama_io_write_i & io) override; - virtual size_t state_set_data(llama_io_read_i & io) override; + size_t state_get_data(llama_io_write_i & io) override; + size_t state_set_data(llama_io_read_i & io) override; - virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; +private: // // members // @@ -532,24 +546,25 @@ class llama_context_recurrent : public llama_context { virtual ~llama_context_recurrent(); protected: - virtual void reserve() override; + void reserve() override; public: - virtual llama_kv_cache * get_kv_self() override; - virtual const llama_kv_cache * get_kv_self() const override; + llama_kv_cache * get_kv_self() override; + const llama_kv_cache * get_kv_self() const override; - virtual void kv_self_update() override; + void kv_self_update() override; - virtual int encode(llama_batch & inp_batch) override; - virtual int decode(llama_batch & inp_batch) override; + int encode(llama_batch & inp_batch) override; + int decode(llama_batch & inp_batch) override; protected: // // input // - virtual void input_set(const llama_ubatch & ubatch) 
override; + void input_set(const llama_ubatch & ubatch) override; +private: struct { ggml_tensor * s_copy; // I32 [kv_size] ggml_tensor * s_mask; // F32 [1, n_kv] @@ -559,20 +574,20 @@ class llama_context_recurrent : public llama_context { // graph // - virtual ggml_cgraph * graph_init() override; + ggml_cgraph * graph_init() override; public: // // graph build // - virtual ggml_tensor * build_inp_s_copy( + ggml_tensor * build_inp_s_copy( ggml_context * ctx0) override; - virtual ggml_tensor * build_inp_s_mask( + ggml_tensor * build_inp_s_mask( ggml_context * ctx0) override; - virtual ggml_tensor * build_copy_mask_state( + ggml_tensor * build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * s, @@ -581,7 +596,7 @@ class llama_context_recurrent : public llama_context { int32_t n_state, int32_t n_seqs) override; - virtual ggml_tensor * build_mamba_layer( + ggml_tensor * build_mamba_layer( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, @@ -590,7 +605,7 @@ class llama_context_recurrent : public llama_context { const llama_ubatch & ubatch, int il) override; - virtual ggml_tensor * build_rwkv_token_shift_load( + ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * state_copy, @@ -598,13 +613,13 @@ class llama_context_recurrent : public llama_context { const llama_ubatch & ubatch, int il) override; - virtual ggml_tensor * build_rwkv_token_shift_store( + ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, int il) override; - virtual ggml_tensor * build_rwkv6_time_mix( + ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, @@ -619,12 +634,13 @@ class llama_context_recurrent : public llama_context { // state save/load // - virtual size_t state_get_data(llama_io_write_i & io) override; - virtual size_t state_set_data(llama_io_read_i & io) override; + size_t state_get_data(llama_io_write_i & io) override; + size_t state_set_data(llama_io_read_i & io) override; - virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; +private: // // members // @@ -646,7 +662,7 @@ class llama_context_enc_dec : public llama_context { virtual ~llama_context_enc_dec(); -protected: +private: llama_context_kv_self ctx_dec; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 25922260d2a7c..c058ee2498880 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -32,14 +32,6 @@ ggml_tensor * llama_graph_i::build_attn( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_self_k_shift( - ggml_context * ctx0) { - GGML_UNUSED(ctx0); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); - return nullptr; -} - ggml_tensor * llama_graph_i::build_inp_embd_enc( ggml_context * ctx0) { GGML_UNUSED(ctx0); diff --git a/src/llama-graph.h b/src/llama-graph.h index 3433caf63ac89..ee56f08396a63 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -117,9 +117,6 @@ class llama_graph_i { float kq_scale, int il); - virtual ggml_tensor * build_inp_self_k_shift( - ggml_context * ctx0); - virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0); From a5a85a3bc0c45d4f31f8ef4bc16ef158b0a8d670 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Feb 2025 
08:59:12 +0200 Subject: [PATCH 71/84] context : fix recurrent reserve ggml-ci --- src/llama-context.cpp | 5 +++++ src/llama-context.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7628cbc9bf20c..f73d4b9bf4c2f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3883,6 +3883,11 @@ llama_context_recurrent::llama_context_recurrent( llama_context_recurrent::~llama_context_recurrent() = default; void llama_context_recurrent::reserve() { + // simulate full KV cache + kv_self.n = kv_self.size; + + LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); + // TODO: implement recurrent-specific reserve logic llama_context::reserve(); } diff --git a/src/llama-context.h b/src/llama-context.h index 0e55aae1c8dfb..2945cbabe4559 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -447,6 +447,7 @@ class llama_context_kv_self : public llama_context { ggml_tensor * self_k_shift; // I32 [kv_size] } inp; +protected: // // graph // @@ -570,6 +571,7 @@ class llama_context_recurrent : public llama_context { ggml_tensor * s_mask; // F32 [1, n_kv] } inp; +protected: // // graph // From 4a1054b55259cb3d43929121294e0ac28a632435 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Feb 2025 11:18:40 +0200 Subject: [PATCH 72/84] context : reuse built_attn_mha ggml-ci --- src/llama-context.cpp | 210 +++++++++++++----------------------------- src/llama-context.h | 17 ++-- src/llama-graph.cpp | 6 -- src/llama-graph.h | 3 - src/llama-model.cpp | 36 +++++++- 5 files changed, 107 insertions(+), 165 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f73d4b9bf4c2f..e05afb5646afc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1721,50 +1721,67 @@ void llama_context::build_attn_inp( ggml_tensor * llama_context::build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) { - const auto & hparams = model.hparams; + GGML_UNUSED(il); - const auto & n_ctx = cparams.n_ctx; + const auto & kq_mask = inp.kq_mask_cnv; - //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); - const auto & kq_mask = inp.kq_mask_cnv; + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + //cb(k, "v", il); - //const auto & n_embd_head_k = hparams.n_embd_head_k; - const auto & n_embd_head_v = hparams.n_embd_head_v; + ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale); - // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - const auto n_kv = n_tokens; + return cur; +} - struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - //cb(q, "q", il); +ggml_tensor * llama_context::build_attn_mha( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * kq_b, + ggml_tensor * kq_mask, + bool v_trans, + float kq_scale) { + const auto & hparams = model.hparams; - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, k_cur, 0, 2, 1, 3)); - //cb(k, "k", il); + //const int64_t n_embd_k_gqa = 
hparams.n_embd_k_gqa(il); + //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + //const int64_t n_head = hparams.n_head(il); + //const int64_t n_head_kv = hparams.n_head_kv(il); + + //const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0]; + + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; + const auto n_kv = k->ne[1]; struct ggml_tensor * cur; - //if (cparams.flash_attn) { - if (false) { // TODO: need to pad the batch size to a multiple of GGML_KQ_MASK_PAD + if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) { GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - GGML_ASSERT(kq_b == nullptr); + GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet"); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_permute(ctx0, v_cur, 0, 2, 1, 3)); - v = ggml_reshape_3d(ctx0, v, n_embd_head_v, n_kv, n_head_kv); + if (v_trans) { + v = ggml_transpose(ctx0, v); + } cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); @@ -1774,7 +1791,6 @@ ggml_tensor * llama_context::build_attn( cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - //cb(kq, "kq", il); // note: this op tends to require high floating point range // while for some models F16 is enough, for others it is not, so we default to F32 here @@ -1802,22 +1818,17 @@ ggml_tensor * llama_context::build_attn( } kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - //cb(kq, "kq_soft_max_ext", il); - - // split cached v into n_head heads - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens))); - v = ggml_reshape_3d(ctx0, v, n_kv, n_embd_head_v, n_head_kv); - //cb(v, "v", il); + if (!v_trans) { + // note: avoid this branch + v = ggml_cont(ctx0, ggml_transpose(ctx0, v)); + } struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - //cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - //cb(kqv_merged, "kqv_merged", il); cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - //cb(cur, "kqv_merged_cont", il); if (!cparams.offload_kqv) { // all nodes between the KV store and the attention output are run on the CPU @@ -1827,18 +1838,6 @@ ggml_tensor * llama_context::build_attn( ggml_build_forward_expand(gf, cur); - if (wo) { - cur = build_lora_mm(ctx0, wo, cur); - } - - if (wo_b) { - //cb(cur, "kqv_wo", il); - } - - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } - return cur; } @@ -3274,13 +3273,10 @@ void llama_context_kv_self::build_attn_inp( ggml_tensor * llama_context_kv_self::build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) { const auto & hparams = model.hparams; @@ -3290,6 +3286,10 @@ ggml_tensor * llama_context_kv_self::build_attn( const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const auto n_tokens = q_cur->ne[2]; + + const bool v_trans = !cparams.flash_attn; + // store to KV cache { GGML_ASSERT(!kv_self.recurrent); @@ -3308,7 +3308,7 @@ ggml_tensor * llama_context_kv_self::build_attn( struct ggml_tensor * v_cache_view = nullptr; - if 
(cparams.flash_attn) { + if (!v_trans) { v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); } else { // note: the V cache is transposed when not using flash attention @@ -3351,16 +3351,15 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto n_kv = kv_self.n; - const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); const auto & n_embd_head_k = hparams.n_embd_head_k; const auto & n_embd_head_v = hparams.n_embd_head_v; - struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); - struct ggml_tensor * k = + ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head_k, n_kv, n_head_kv, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), @@ -3368,100 +3367,19 @@ ggml_tensor * llama_context_kv_self::build_attn( 0); //cb(k, "k", il); - struct ggml_tensor * cur; - - if (cparams.flash_attn) { - GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - - GGML_ASSERT(kq_b == nullptr); - - // split cached v into n_head heads (not transposed) - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), - 0); - //cb(v, "v", il); - - cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, - hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); - - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - - cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); - } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - //cb(kq, "kq", il); - - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below - - kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx0, kq, 30); - } - - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh (ctx0, kq); - kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); - } - - if (kq_b) { - kq = ggml_add(ctx0, kq, kq_b); - } - - kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - //cb(kq, "kq_soft_max_ext", il); - - GGML_ASSERT(kv_self.size == n_ctx); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - //cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - //cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - //cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - //cb(cur, "kqv_merged_cont", il); - - if (!cparams.offload_kqv) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); - } - } - - ggml_build_forward_expand(gf, cur); - - if (wo) { - cur = build_lora_mm(ctx0, wo, cur); - } - - if 
(wo_b) { - //cb(cur, "kqv_wo", il); - } + ggml_tensor * v = !v_trans ? + ggml_view_3d(ctx0, kv_self.v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + 0) : + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } + struct ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale); return cur; } diff --git a/src/llama-context.h b/src/llama-context.h index 2945cbabe4559..5b63b3b06d21c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -261,17 +261,25 @@ struct llama_context : public llama_graph_i { ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) override; protected: + virtual ggml_tensor * build_attn_mha( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * kq_b, + ggml_tensor * kq_mask, + bool v_trans, + float kq_scale); + virtual ggml_tensor * build_inp_self_k_shift( ggml_context * ctx0); @@ -472,13 +480,10 @@ class llama_context_kv_self : public llama_context { ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) override; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index c058ee2498880..99eb326205bc6 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -7,24 +7,18 @@ llama_graph_i::llama_graph_i(llama_graph_type type) : type(type) {} ggml_tensor * llama_graph_i::build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); - GGML_UNUSED(wo); - GGML_UNUSED(wo_b); GGML_UNUSED(q_cur); GGML_UNUSED(k_cur); GGML_UNUSED(v_cur); GGML_UNUSED(kq_b); - GGML_UNUSED(n_tokens); GGML_UNUSED(kq_scale); GGML_UNUSED(il); diff --git a/src/llama-graph.h b/src/llama-graph.h index ee56f08396a63..c84c254934ff1 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -107,13 +107,10 @@ class llama_graph_i { virtual ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1e34ed80388bb..e8057f4687fdf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4265,18 +4265,32 @@ struct llm_build_context { struct ggml_tensor * q_cur, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, - int32_t n_tokens, + int32_t n_tokens, // TODO: remove float kq_scale, int il) { + GGML_UNUSED(n_tokens); + // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, nullptr, 
n_tokens, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); cb(cur, "kqv_out", il); + if (wo) { + cur = lgf->build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + return cur; } @@ -4288,18 +4302,32 @@ struct llm_build_context { struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, struct ggml_tensor * kq_b, - int32_t n_tokens, + int32_t n_tokens, // TODO: remove float kq_scale, int il) { + GGML_UNUSED(n_tokens); + // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, kq_b, n_tokens, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, q_cur, k_cur, v_cur, kq_b, kq_scale, il); cb(cur, "kqv_out", il); + if (wo) { + cur = lgf->build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + return cur; } From 9cd78f11a103c578cb598b16b4e49fc4709754a2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Feb 2025 13:38:11 +0200 Subject: [PATCH 73/84] context : explicit llama_context_i abstract interface ggml-ci --- src/llama-context.cpp | 202 +++++++++++++++---------------- src/llama-context.h | 268 +++++++++++++++++++++++++++++++----------- 2 files changed, 299 insertions(+), 171 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e05afb5646afc..6b101f4869e44 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -42,16 +42,17 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t } // -// llama_context +// llama_context_base // -llama_context::llama_context( +llama_context_base::llama_context_base( const llama_model & model, llama_context_params params, llama_graph_type gtype) : + llama_context_i(), llama_graph_i(gtype), model(model) { - LLAMA_LOG_INFO("%s: constructing llama_context, gtype = %d\n", __func__, gtype); + LLAMA_LOG_INFO("%s: constructing llama_context_base, gtype = %d\n", __func__, gtype); t_start_us = model.t_start_us; t_load_us = model.t_load_us; @@ -223,9 +224,9 @@ llama_context::llama_context( } } -llama_context::~llama_context() = default; +llama_context_base::~llama_context_base() = default; -void llama_context::init() { +void llama_context_base::init() { LLAMA_LOG_DEBUG("%s: call\n", __func__); const auto & hparams = model.hparams; @@ -306,7 +307,7 @@ void llama_context::init() { reserve(); } -void llama_context::synchronize() { +void llama_context_base::synchronize() { ggml_backend_sched_synchronize(sched.get()); // FIXME: if multiple single tokens are evaluated without a synchronization, @@ -336,7 +337,7 @@ void llama_context::synchronize() { t_compute_start_us = 0; } -void llama_context::reserve() { +void llama_context_base::reserve() { uint32_t n_seqs = 1; // TODO: worst-case number of sequences uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -415,72 +416,72 @@ void llama_context::reserve() { } } -const llama_model & llama_context::get_model() const { +const llama_model & llama_context_base::get_model() const { return model; } -const llama_cparams & llama_context::get_cparams() const { +const llama_cparams & llama_context_base::get_cparams() const { return cparams; } -uint32_t 
llama_context::n_ctx() const { +uint32_t llama_context_base::n_ctx() const { return cparams.n_ctx; } -uint32_t llama_context::n_ctx_per_seq() const { +uint32_t llama_context_base::n_ctx_per_seq() const { return cparams.n_ctx / cparams.n_seq_max; } -uint32_t llama_context::n_batch() const { +uint32_t llama_context_base::n_batch() const { return cparams.n_batch; } -uint32_t llama_context::n_ubatch() const { +uint32_t llama_context_base::n_ubatch() const { return cparams.n_ubatch; } -uint32_t llama_context::n_seq_max() const { +uint32_t llama_context_base::n_seq_max() const { return cparams.n_seq_max; } -uint32_t llama_context::n_threads() const { +uint32_t llama_context_base::n_threads() const { return cparams.n_threads; } -uint32_t llama_context::n_threads_batch() const { +uint32_t llama_context_base::n_threads_batch() const { return cparams.n_threads_batch; } -int32_t llama_context::max_nodes() const { +int32_t llama_context_base::max_nodes() const { return std::max(8192, 5*model.n_tensors()); } -llama_kv_cache * llama_context::get_kv_self() { - LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); +llama_kv_cache * llama_context_base::get_kv_self() { + LLAMA_LOG_WARN("%s: llama_context_base does not have a KV cache\n", __func__); return nullptr; } -const llama_kv_cache * llama_context::get_kv_self() const { - LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); +const llama_kv_cache * llama_context_base::get_kv_self() const { + LLAMA_LOG_WARN("%s: llama_context_base does not have a KV cache\n", __func__); return nullptr; } -void llama_context::kv_self_update() { - LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); +void llama_context_base::kv_self_update() { + LLAMA_LOG_WARN("%s: llama_context_base does not have a KV cache\n", __func__); } -enum llama_pooling_type llama_context::pooling_type() const { +enum llama_pooling_type llama_context_base::pooling_type() const { return cparams.pooling_type; } -float * llama_context::get_logits() { +float * llama_context_base::get_logits() { // reorder logits for backward compatibility output_reorder(); return logits; } -float * llama_context::get_logits_ith(int32_t i) { +float * llama_context_base::get_logits_ith(int32_t i) { int32_t j = -1; try { @@ -518,14 +519,14 @@ float * llama_context::get_logits_ith(int32_t i) { } } -float * llama_context::get_embeddings() { +float * llama_context_base::get_embeddings() { // reorder embeddings for backward compatibility output_reorder(); return embd; } -float * llama_context::get_embeddings_ith(int32_t i) { +float * llama_context_base::get_embeddings_ith(int32_t i) { int32_t j = -1; try { @@ -563,7 +564,7 @@ float * llama_context::get_embeddings_ith(int32_t i) { } } -float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { +float * llama_context_base::get_embeddings_seq(llama_seq_id seq_id) { auto it = embd_seq.find(seq_id); if (it == embd_seq.end()) { return nullptr; @@ -572,11 +573,11 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } -int64_t llama_context::n_pos_per_token() const { +int64_t llama_context_base::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; } -void llama_context::attach_threadpool( +void llama_context_base::attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) { LLAMA_LOG_DEBUG("%s: call\n", __func__); @@ -585,21 +586,21 @@ void llama_context::attach_threadpool( this->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; } -void llama_context::detach_threadpool() { +void llama_context_base::detach_threadpool() { LLAMA_LOG_DEBUG("%s: call\n", __func__); this->threadpool = nullptr; this->threadpool_batch = nullptr; } -void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { +void llama_context_base::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { LLAMA_LOG_DEBUG("%s: n_threads = %d, n_threads_batch = %d\n", __func__, n_threads, n_threads_batch); cparams.n_threads = n_threads; cparams.n_threads_batch = n_threads_batch; } -void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { +void llama_context_base::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { LLAMA_LOG_DEBUG("%s: call\n", __func__); this->abort_callback = abort_callback; @@ -614,19 +615,19 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void } } -void llama_context::set_embeddings(bool value) { +void llama_context_base::set_embeddings(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); cparams.embeddings = value; } -void llama_context::set_causal_attn(bool value) { +void llama_context_base::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); cparams.causal_attn = value; } -void llama_context::set_adapter_lora( +void llama_context_base::set_adapter_lora( llama_adapter_lora * adapter, float scale) { LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale); @@ -634,7 +635,7 @@ void llama_context::set_adapter_lora( loras[adapter] = scale; } -bool llama_context::rm_adapter_lora( +bool llama_context_base::rm_adapter_lora( llama_adapter_lora * adapter) { LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter); @@ -647,13 +648,13 @@ bool llama_context::rm_adapter_lora( return false; } -void llama_context::clear_adapter_lora() { +void llama_context_base::clear_adapter_lora() { LLAMA_LOG_DEBUG("%s: call\n", __func__); loras.clear(); } -bool llama_context::apply_adapter_cvec( +bool llama_context_base::apply_adapter_cvec( const float * data, size_t len, int32_t n_embd, @@ -664,7 +665,7 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -int llama_context::encode(llama_batch & inp_batch) { +int llama_context_base::encode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -798,7 +799,7 @@ int llama_context::encode(llama_batch & inp_batch) { return 0; } -int llama_context::decode(llama_batch & inp_batch) { +int llama_context_base::decode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -829,7 +830,7 @@ int llama_context::decode(llama_batch & inp_batch) { } // micro-batching is not possible without KV cache - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context requires n_ubatch >= n_tokens"); + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context_base requires n_ubatch >= n_tokens"); if (t_compute_start_us == 0) { t_compute_start_us = ggml_time_us(); @@ -1006,7 +1007,7 @@ int llama_context::decode(llama_batch & inp_batch) { // input // -void llama_context::input_set(const llama_ubatch & ubatch) { +void llama_context_base::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; if (ubatch.token) { @@ -1280,7 +1281,7 @@ void 
llama_context::input_set(const llama_ubatch & ubatch) { // output // -int32_t llama_context::output_reserve(int32_t n_outputs) { +int32_t llama_context_base::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; @@ -1348,7 +1349,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { return n_outputs_max; } -void llama_context::output_reorder() { +void llama_context_base::output_reorder() { auto & out_ids = sbatch.out_ids; if (!out_ids.empty()) { const uint32_t n_vocab = model.vocab.n_tokens(); @@ -1390,7 +1391,7 @@ void llama_context::output_reorder() { // graph // -ggml_cgraph * llama_context::graph_init() { +ggml_cgraph * llama_context_base::graph_init() { inp = {}; struct ggml_init_params params = { @@ -1404,14 +1405,14 @@ ggml_cgraph * llama_context::graph_init() { return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); } -llama_graph_result llama_context::graph_build( +llama_graph_result llama_context_base::graph_build( ggml_context * ctx, ggml_cgraph * gf, const llama_ubatch & ubatch) { return model.build_graph(ctx, gf, this, cparams, ubatch); } -enum ggml_status llama_context::graph_compute( +enum ggml_status llama_context_base::graph_compute( ggml_cgraph * gf, bool batched) { int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; @@ -1442,7 +1443,7 @@ enum ggml_status llama_context::graph_compute( // graph build API // -void llama_context::build_cb( +void llama_context_base::build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, @@ -1477,14 +1478,14 @@ void llama_context::build_cb( } } -ggml_tensor * llama_context::build_cvec( +ggml_tensor * llama_context_base::build_cvec( ggml_context * ctx0, ggml_tensor * cur, int il) { return cvec.apply_to(ctx0, cur, il); } -ggml_tensor * llama_context::build_lora_mm( +ggml_tensor * llama_context_base::build_lora_mm( ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur) { @@ -1511,7 +1512,7 @@ ggml_tensor * llama_context::build_lora_mm( return res; } -ggml_tensor * llama_context::build_lora_mm_id( +ggml_tensor * llama_context_base::build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur, @@ -1540,7 +1541,7 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } -ggml_tensor * llama_context::build_rope_factors(int il) { +ggml_tensor * llama_context_base::build_rope_factors(int il) { const auto & hparams = model.hparams; // choose long/short freq factors based on the context size @@ -1557,7 +1558,7 @@ ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } -ggml_tensor * llama_context::build_rope_shift( +ggml_tensor * llama_context_base::build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, @@ -1606,7 +1607,7 @@ ggml_tensor * llama_context::build_rope_shift( return tmp; } -ggml_tensor * llama_context::build_inp_embd( +ggml_tensor * llama_context_base::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) { @@ -1656,7 +1657,7 @@ ggml_tensor * llama_context::build_inp_embd( return inpL; } -ggml_tensor * llama_context::build_inp_pos( +ggml_tensor * llama_context_base::build_inp_pos( ggml_context * ctx0, int32_t n_tokens) { inp.pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); @@ -1665,7 +1666,7 @@ ggml_tensor * llama_context::build_inp_pos( return inp.pos; } -ggml_tensor * llama_context::build_inp_pos_bucket( +ggml_tensor * llama_context_base::build_inp_pos_bucket( ggml_context * ctx0, 
int32_t n_tokens) { inp.pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); @@ -1674,7 +1675,7 @@ ggml_tensor * llama_context::build_inp_pos_bucket( return inp.pos_bucket; } -ggml_tensor * llama_context::build_inp_out_ids( +ggml_tensor * llama_context_base::build_inp_out_ids( ggml_context * ctx0) { const int32_t n_out_ids = n_outputs; @@ -1684,7 +1685,7 @@ ggml_tensor * llama_context::build_inp_out_ids( return inp.out_ids; } -ggml_tensor * llama_context::build_inp_mean( +ggml_tensor * llama_context_base::build_inp_mean( ggml_context * ctx0, int32_t n_tokens) { inp.mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); @@ -1693,7 +1694,7 @@ ggml_tensor * llama_context::build_inp_mean( return inp.mean; } -ggml_tensor * llama_context::build_inp_cls( +ggml_tensor * llama_context_base::build_inp_cls( ggml_context * ctx0, int32_t n_tokens) { inp.cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -1702,7 +1703,7 @@ ggml_tensor * llama_context::build_inp_cls( return inp.cls; } -void llama_context::build_attn_inp( +void llama_context_base::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -1718,7 +1719,7 @@ void llama_context::build_attn_inp( inp.kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.kq_mask, GGML_TYPE_F16) : inp.kq_mask; } -ggml_tensor * llama_context::build_attn( +ggml_tensor * llama_context_base::build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -1745,7 +1746,7 @@ ggml_tensor * llama_context::build_attn( return cur; } -ggml_tensor * llama_context::build_attn_mha( +ggml_tensor * llama_context_base::build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q, @@ -1774,6 +1775,7 @@ ggml_tensor * llama_context::build_attn_mha( struct ggml_tensor * cur; + // TODO: replace hardcoded padding with ggml-provided padding if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) { GGML_UNUSED(model); @@ -1841,7 +1843,7 @@ ggml_tensor * llama_context::build_attn_mha( return cur; } -ggml_tensor * llama_context::build_inp_self_k_shift( +ggml_tensor * llama_context_base::build_inp_self_k_shift( ggml_context * ctx0) { GGML_UNUSED(ctx0); @@ -1849,7 +1851,7 @@ ggml_tensor * llama_context::build_inp_self_k_shift( return nullptr; } -void llama_context::build_kv_self_shift( +void llama_context_base::build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) { GGML_UNUSED(ctx0); @@ -1858,7 +1860,7 @@ void llama_context::build_kv_self_shift( LLAMA_LOG_ERROR("%s: not implemented\n", __func__); } -void llama_context::build_kv_self_defrag( +void llama_context_base::build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * gf) { GGML_UNUSED(ctx0); @@ -1872,7 +1874,7 @@ void llama_context::build_kv_self_defrag( // perf // -llama_perf_context_data llama_context::perf_get_data() const { +llama_perf_context_data llama_context_base::perf_get_data() const { llama_perf_context_data data = {}; data.t_start_ms = 1e-3 * t_start_us; @@ -1885,7 +1887,7 @@ llama_perf_context_data llama_context::perf_get_data() const { return data; } -void llama_context::perf_reset() { +void llama_context_base::perf_reset() { t_start_us = ggml_time_us(); t_eval_us = n_eval = 0; t_p_eval_us = n_p_eval = 0; @@ -2029,7 +2031,7 @@ class llama_io_read_file : public llama_io_read_i { std::vector temp_buffer; }; -size_t llama_context::state_get_size() { +size_t llama_context_base::state_get_size() { llama_io_write_dummy io; try { return state_get_data(io); @@ -2039,7 +2041,7 @@ size_t llama_context::state_get_size() { } } -size_t 
llama_context::state_get_data(uint8_t * dst, size_t size) { +size_t llama_context_base::state_get_data(uint8_t * dst, size_t size) { llama_io_write_buffer io(dst, size); try { return state_get_data(io); @@ -2049,7 +2051,7 @@ size_t llama_context::state_get_data(uint8_t * dst, size_t size) { } } -size_t llama_context::state_set_data(const uint8_t * src, size_t size) { +size_t llama_context_base::state_set_data(const uint8_t * src, size_t size) { llama_io_read_buffer io(src, size); try { return state_set_data(io); @@ -2059,7 +2061,7 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) { } } -size_t llama_context::state_seq_get_size(llama_seq_id seq_id) { +size_t llama_context_base::state_seq_get_size(llama_seq_id seq_id) { llama_io_write_dummy io; try { return state_seq_get_data(io, seq_id); @@ -2069,7 +2071,7 @@ size_t llama_context::state_seq_get_size(llama_seq_id seq_id) { } } -size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { +size_t llama_context_base::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { llama_io_write_buffer io(dst, size); try { return state_seq_get_data(io, seq_id); @@ -2079,7 +2081,7 @@ size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, siz } } -size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { +size_t llama_context_base::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { llama_io_read_buffer io(src, size); try { return state_seq_set_data(io, seq_id); @@ -2089,7 +2091,7 @@ size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * sr } } -bool llama_context::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +bool llama_context_base::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(filepath, "rb"); // sanity checks @@ -2132,7 +2134,7 @@ bool llama_context::state_load_file(const char * filepath, llama_token * tokens_ return true; } -bool llama_context::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { +bool llama_context_base::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { llama_file file(filepath, "wb"); file.write_u32(LLAMA_SESSION_MAGIC); @@ -2149,7 +2151,7 @@ bool llama_context::state_save_file(const char * filepath, const llama_token * t return true; } -size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +size_t llama_context_base::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(filepath, "rb"); // version checks @@ -2192,7 +2194,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file return file.tell(); } -size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { +size_t llama_context_base::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { llama_file file(filepath, "wb"); file.write_u32(LLAMA_STATE_SEQ_MAGIC); @@ -2212,7 +2214,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file return res; } -size_t 
llama_context::state_get_data(llama_io_write_i & io) { +size_t llama_context_base::state_get_data(llama_io_write_i & io) { LLAMA_LOG_DEBUG("%s: writing state\n", __func__); // write model info @@ -2285,7 +2287,7 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { return io.n_bytes(); } -size_t llama_context::state_set_data(llama_io_read_i & io) { +size_t llama_context_base::state_set_data(llama_io_read_i & io) { LLAMA_LOG_DEBUG("%s: reading state\n", __func__); // read model info @@ -2366,13 +2368,13 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { return io.n_bytes(); } -size_t llama_context::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { +size_t llama_context_base::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); return io.n_bytes(); } -size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { +size_t llama_context_base::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); return io.n_bytes(); @@ -2386,7 +2388,7 @@ llama_context_kv_self::llama_context_kv_self( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context(model, params, gtype), + llama_context_base(model, params, gtype), kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_kv_self\n", __func__); @@ -2436,7 +2438,7 @@ void llama_context_kv_self::reserve() { LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); - llama_context::reserve(); + llama_context_base::reserve(); } llama_kv_cache * llama_context_kv_self::get_kv_self() { @@ -3033,7 +3035,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } // call base functionality - llama_context::input_set(ubatch); + llama_context_base::input_set(ubatch); if (inp.self_kq_mask || inp.self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
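// For orientation while reading this patch: the class layout it introduces is
// roughly the following (all names appear in the llama-context.h hunks below):
//
//   struct llama_context           - abstract interface mirroring the public C API
//   class  llama_context_i         - C++ alias for llama_context
//   class  llama_context_base      - basic transformer without a KV cache
//   class  llama_context_kv_self   - llama_context_base + self-attention KV cache
//   class  llama_context_recurrent - llama_context_base + recurrent state (RWKV, Mamba)
//   class  llama_context_enc       - encoder-only variant of llama_context_base
//   class  llama_context_enc_dec   - composes an encoder context with a decoder context
//
// With the interface split out, the public C API entry points can dispatch through
// the virtual methods without knowing which concrete context was constructed.
// A minimal sketch of such a wrapper (assumed shape, not part of this patch):

int32_t llama_decode(llama_context * ctx, llama_batch batch) {
    return ctx->decode(batch); // resolves to the _base / _kv_self / _recurrent / ... override
}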
@@ -3219,7 +3221,7 @@ ggml_cgraph * llama_context_kv_self::graph_init() { inp = {}; - return llama_context::graph_init(); + return llama_context_base::graph_init(); } ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { @@ -3719,7 +3721,7 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( // state save/load size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { - llama_context::state_get_data(io); + llama_context_base::state_get_data(io); LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); kv_self.state_write(io); @@ -3728,7 +3730,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { } size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { - llama_context::state_set_data(io); + llama_context_base::state_set_data(io); LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); kv_self.state_read(io); @@ -3737,7 +3739,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { } size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - llama_context::state_seq_get_data(io, seq_id); + llama_context_base::state_seq_get_data(io, seq_id); kv_self.state_write(io, seq_id); @@ -3745,7 +3747,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se } size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - llama_context::state_seq_set_data(io, seq_id); + llama_context_base::state_seq_set_data(io, seq_id); kv_self.state_read(io, seq_id); @@ -3760,7 +3762,7 @@ llama_context_recurrent::llama_context_recurrent( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context(model, params, gtype), + llama_context_base(model, params, gtype), kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); @@ -3807,7 +3809,7 @@ void llama_context_recurrent::reserve() { LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); // TODO: implement recurrent-specific reserve logic - llama_context::reserve(); + llama_context_base::reserve(); } llama_kv_cache * llama_context_recurrent::get_kv_self() { @@ -4139,7 +4141,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // call base functionality - llama_context::input_set(ubatch); + llama_context_base::input_set(ubatch); GGML_ASSERT(kv_self.recurrent); @@ -4193,7 +4195,7 @@ ggml_cgraph * llama_context_recurrent::graph_init() { inp.s_copy = nullptr; inp.s_mask = nullptr; - return llama_context::graph_init(); + return llama_context_base::graph_init(); } ggml_tensor * llama_context_recurrent::build_inp_s_copy( @@ -4602,7 +4604,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( // state save/load size_t llama_context_recurrent::state_get_data(llama_io_write_i & io) { - llama_context::state_get_data(io); + llama_context_base::state_get_data(io); kv_self.state_write(io); @@ -4610,7 +4612,7 @@ size_t llama_context_recurrent::state_get_data(llama_io_write_i & io) { } size_t llama_context_recurrent::state_set_data(llama_io_read_i & io) { - llama_context::state_set_data(io); + llama_context_base::state_set_data(io); kv_self.state_read(io); @@ -4618,7 +4620,7 @@ size_t llama_context_recurrent::state_set_data(llama_io_read_i & io) { } size_t llama_context_recurrent::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - llama_context::state_seq_get_data(io, seq_id); + 
llama_context_base::state_seq_get_data(io, seq_id); kv_self.state_write(io, seq_id); @@ -4626,7 +4628,7 @@ size_t llama_context_recurrent::state_seq_get_data(llama_io_write_i & io, llama_ } size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - llama_context::state_seq_set_data(io, seq_id); + llama_context_base::state_seq_set_data(io, seq_id); kv_self.state_read(io, seq_id); @@ -4640,7 +4642,7 @@ size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_s llama_context_enc_dec::llama_context_enc_dec( const llama_model & model, llama_context_params params) : - llama_context(model, params, LLAMA_GRAPH_TYPE_ENCODER), + llama_context_enc(model, params, LLAMA_GRAPH_TYPE_ENCODER), ctx_dec(model, params, LLAMA_GRAPH_TYPE_DECODER) { LLAMA_LOG_INFO("%s: constructing llama_context_enc_dec\n", __func__); } diff --git a/src/llama-context.h b/src/llama-context.h index 5b63b3b06d21c..d647a426cd1be 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -20,90 +20,78 @@ class llama_io_write_i; using llama_loras = std::unordered_map; -// basic transformer without KV cache -struct llama_context : public llama_graph_i { +// abstract interface corresponding to the public C API +struct llama_context { public: - llama_context( - const llama_model & model, - llama_context_params params, - llama_graph_type gtype); - - virtual ~llama_context(); - - // init scheduler and compute buffers, reserve worst-case graphs - // call once after the context is constructed - virtual void init(); + llama_context() = default; + virtual ~llama_context() = default; - virtual void synchronize(); + virtual void init() = 0; -protected: - // called by init() to reserve the worst-case graphs - // override in child classes - virtual void reserve(); + virtual void synchronize() = 0; -public: - const llama_model & get_model() const; - const llama_cparams & get_cparams() const; + virtual const llama_model & get_model() const = 0; + virtual const llama_cparams & get_cparams() const = 0; - virtual uint32_t n_ctx() const; - virtual uint32_t n_ctx_per_seq() const; - virtual uint32_t n_batch() const; - virtual uint32_t n_ubatch() const; - virtual uint32_t n_seq_max() const; + virtual uint32_t n_ctx() const = 0; + virtual uint32_t n_ctx_per_seq() const = 0; + virtual uint32_t n_batch() const = 0; + virtual uint32_t n_ubatch() const = 0; + virtual uint32_t n_seq_max() const = 0; - virtual uint32_t n_threads() const; - virtual uint32_t n_threads_batch() const; + virtual uint32_t n_threads() const = 0; + virtual uint32_t n_threads_batch() const = 0; - virtual int32_t max_nodes() const; + virtual int32_t max_nodes() const = 0; // self-attention: // if the context does not have a KV cache, return nullptr - virtual llama_kv_cache * get_kv_self(); - virtual const llama_kv_cache * get_kv_self() const; + virtual llama_kv_cache * get_kv_self() = 0; + virtual const llama_kv_cache * get_kv_self() const = 0; // if the context does not have a KV cache, noop - virtual void kv_self_update(); + virtual void kv_self_update() = 0; - virtual enum llama_pooling_type pooling_type() const; + virtual enum llama_pooling_type pooling_type() const = 0; - virtual float * get_logits(); - virtual float * get_logits_ith(int32_t i); + virtual float * get_logits() = 0; + virtual float * get_logits_ith(int32_t i) = 0; - virtual float * get_embeddings(); - virtual float * get_embeddings_ith(int32_t i); - virtual float * get_embeddings_seq(llama_seq_id seq_id); + virtual float * get_embeddings() = 0; + virtual 
float * get_embeddings_ith(int32_t i) = 0; + virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; - virtual int64_t n_pos_per_token() const; // vision + virtual int64_t n_pos_per_token() const = 0; // vision virtual void attach_threadpool( ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch); + ggml_threadpool_t threadpool_batch) = 0; - virtual void detach_threadpool(); + virtual void detach_threadpool() = 0; - virtual void set_n_threads(int32_t n_threads, int32_t n_threads_batch); + virtual void set_n_threads(int32_t n_threads, int32_t n_threads_batch) = 0; - virtual void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); + virtual void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) = 0; - virtual void set_embeddings (bool value); - virtual void set_causal_attn(bool value); + virtual void set_embeddings (bool value) = 0; + virtual void set_causal_attn(bool value) = 0; virtual void set_adapter_lora( llama_adapter_lora * adapter, - float scale); + float scale) = 0; virtual bool rm_adapter_lora( - llama_adapter_lora * adapter); + llama_adapter_lora * adapter) = 0; - virtual void clear_adapter_lora(); + virtual void clear_adapter_lora() = 0; virtual bool apply_adapter_cvec( const float * data, size_t len, int32_t n_embd, int32_t il_start, - int32_t il_end); + int32_t il_end) = 0; // encode a batch of tokens by evaluating the encoder part of the transformer // @@ -114,7 +102,7 @@ struct llama_context : public llama_graph_i { // return positive int on warning // return negative int on error // - virtual int encode(llama_batch & inp_batch); + virtual int encode(llama_batch & inp_batch) = 0; // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), @@ -128,7 +116,145 @@ struct llama_context : public llama_graph_i { // return positive int on warning // return negative int on error // - virtual int decode(llama_batch & inp_batch); + virtual int decode(llama_batch & inp_batch) = 0; + + // + // perf + // + + virtual llama_perf_context_data perf_get_data() const = 0; + virtual void perf_reset() = 0; + + // + // state save/load + // + + virtual size_t state_get_size() = 0; + virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; + virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; + + virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; + virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; + virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; + + virtual bool state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) = 0; + + virtual bool state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) = 0; + + virtual size_t state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) = 0; + + virtual size_t state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) = 0; +}; + +// C++ alias +class llama_context_i : public llama_context { +public: + using llama_context::llama_context; +}; + +// basic transformer without KV cache +class llama_context_base : public llama_context_i, public llama_graph_i { +public: + llama_context_base( + const llama_model & model, + llama_context_params params, + 
llama_graph_type gtype); + + virtual ~llama_context_base(); + + // init scheduler and compute buffers, reserve worst-case graphs + // call once after the context is constructed + void init() override; + + void synchronize() override; + +protected: + // called by init() to reserve the worst-case graphs + // override in child classes + virtual void reserve(); + +public: + const llama_model & get_model() const override; + const llama_cparams & get_cparams() const override; + + uint32_t n_ctx() const override; + uint32_t n_ctx_per_seq() const override; + uint32_t n_batch() const override; + uint32_t n_ubatch() const override; + uint32_t n_seq_max() const override; + + uint32_t n_threads() const override; + uint32_t n_threads_batch() const override; + + int32_t max_nodes() const override; + + // self-attention: + + // if the context does not have a KV cache, return nullptr + llama_kv_cache * get_kv_self() override; + const llama_kv_cache * get_kv_self() const override; + + // if the context does not have a KV cache, noop + void kv_self_update() override; + + enum llama_pooling_type pooling_type() const override; + + float * get_logits() override; + float * get_logits_ith(int32_t i) override; + + float * get_embeddings() override; + float * get_embeddings_ith(int32_t i) override; + float * get_embeddings_seq(llama_seq_id seq_id) override; + + int64_t n_pos_per_token() const override; // vision + + void attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) override; + + void detach_threadpool() override; + + void set_n_threads(int32_t n_threads, int32_t n_threads_batch) override; + + void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) override; + + void set_embeddings (bool value) override; + void set_causal_attn(bool value) override; + + void set_adapter_lora( + llama_adapter_lora * adapter, + float scale) override; + + bool rm_adapter_lora( + llama_adapter_lora * adapter) override; + + void clear_adapter_lora() override; + + bool apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) override; + + int encode(llama_batch & inp_batch) override; + int decode(llama_batch & inp_batch) override; protected: // @@ -297,8 +423,8 @@ struct llama_context : public llama_graph_i { // perf // - virtual llama_perf_context_data perf_get_data() const; - virtual void perf_reset(); + llama_perf_context_data perf_get_data() const override; + void perf_reset() override; protected: // TODO: become private @@ -318,37 +444,37 @@ struct llama_context : public llama_graph_i { // state save/load // - virtual size_t state_get_size(); - virtual size_t state_get_data( uint8_t * dst, size_t size); - virtual size_t state_set_data(const uint8_t * src, size_t size); + size_t state_get_size() override; + size_t state_get_data( uint8_t * dst, size_t size) override; + size_t state_set_data(const uint8_t * src, size_t size) override; - virtual size_t state_seq_get_size(llama_seq_id seq_id); - virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size); - virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size); + size_t state_seq_get_size(llama_seq_id seq_id) override; + size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; + size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; - virtual bool state_load_file( + bool state_load_file( const char * filepath, llama_token * tokens_out, 
size_t n_token_capacity, - size_t * n_token_count_out); + size_t * n_token_count_out) override; - virtual bool state_save_file( + bool state_save_file( const char * filepath, const llama_token * tokens, - size_t n_token_count); + size_t n_token_count) override; - virtual size_t state_seq_load_file( + size_t state_seq_load_file( llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, - size_t * n_token_count_out); + size_t * n_token_count_out) override; - virtual size_t state_seq_save_file( + size_t state_seq_save_file( llama_seq_id seq_id, const char * filepath, const llama_token * tokens, - size_t n_token_count); + size_t n_token_count) override; protected: virtual size_t state_get_data(llama_io_write_i & io); @@ -417,7 +543,7 @@ struct llama_context : public llama_graph_i { }; // transformer with a self-attention KV cache -class llama_context_kv_self : public llama_context { +class llama_context_kv_self : public llama_context_base { public: llama_context_kv_self( const llama_model & model, @@ -542,7 +668,7 @@ class llama_context_kv_self : public llama_context { }; // a recurrent transformer (ie.e RWKV, Mamba) -class llama_context_recurrent : public llama_context { +class llama_context_recurrent : public llama_context_base { public: llama_context_recurrent( const llama_model & model, @@ -656,12 +782,12 @@ class llama_context_recurrent : public llama_context { llama_kv_cache_recurrent kv_self; }; -class llama_context_enc : public llama_context { +class llama_context_enc : public llama_context_base { public: - using llama_context::llama_context; + using llama_context_base::llama_context_base; }; -class llama_context_enc_dec : public llama_context { +class llama_context_enc_dec : public llama_context_enc { public: llama_context_enc_dec( const llama_model & model, From be58e30017b445e2146c8bc1784ae0b291fae48c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Feb 2025 15:16:45 +0200 Subject: [PATCH 74/84] enc-dec : compose wip ggml-ci --- src/llama-context.cpp | 777 ++++++++++++++++++++++++++++++++---------- src/llama-context.h | 261 +++++++++++--- src/llama-graph.cpp | 26 +- src/llama-graph.h | 14 +- src/llama-model.cpp | 328 +++++++++--------- 5 files changed, 1002 insertions(+), 404 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6b101f4869e44..81663c40018e3 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -261,7 +261,7 @@ void llama_context_base::init() { LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size()); - const size_t max_nodes = this->max_nodes(); + const size_t max_nodes = this->graph_max_nodes(); LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); @@ -420,10 +420,6 @@ const llama_model & llama_context_base::get_model() const { return model; } -const llama_cparams & llama_context_base::get_cparams() const { - return cparams; -} - uint32_t llama_context_base::n_ctx() const { return cparams.n_ctx; } @@ -452,10 +448,6 @@ uint32_t llama_context_base::n_threads_batch() const { return cparams.n_threads_batch; } -int32_t llama_context_base::max_nodes() const { - return std::max(8192, 5*model.n_tensors()); -} - llama_kv_cache * llama_context_base::get_kv_self() { LLAMA_LOG_WARN("%s: llama_context_base does not have a KV cache\n", __func__); return nullptr; @@ -573,10 +565,6 @@ float * llama_context_base::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } -int64_t llama_context_base::n_pos_per_token() const { - return model.arch == 
LLM_ARCH_QWEN2VL ? 4 : 1; -} - void llama_context_base::attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) { @@ -1007,6 +995,10 @@ int llama_context_base::decode(llama_batch & inp_batch) { // input // +int64_t llama_context_base::n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; +} + void llama_context_base::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; @@ -1391,6 +1383,10 @@ void llama_context_base::output_reorder() { // graph // +int32_t llama_context_base::graph_max_nodes() const { + return std::max(8192, 5*model.n_tensors()); +} + ggml_cgraph * llama_context_base::graph_init() { inp = {}; @@ -1402,7 +1398,7 @@ ggml_cgraph * llama_context_base::graph_init() { ctx_compute.reset(ggml_init(params)); - return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); + return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); } llama_graph_result llama_context_base::graph_build( @@ -2034,7 +2030,7 @@ class llama_io_read_file : public llama_io_read_i { size_t llama_context_base::state_get_size() { llama_io_write_dummy io; try { - return state_get_data(io); + return state_write_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); return 0; @@ -2044,7 +2040,7 @@ size_t llama_context_base::state_get_size() { size_t llama_context_base::state_get_data(uint8_t * dst, size_t size) { llama_io_write_buffer io(dst, size); try { - return state_get_data(io); + return state_write_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); return 0; @@ -2054,7 +2050,7 @@ size_t llama_context_base::state_get_data(uint8_t * dst, size_t size) { size_t llama_context_base::state_set_data(const uint8_t * src, size_t size) { llama_io_read_buffer io(src, size); try { - return state_set_data(io); + return state_read_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; @@ -2064,7 +2060,7 @@ size_t llama_context_base::state_set_data(const uint8_t * src, size_t size) { size_t llama_context_base::state_seq_get_size(llama_seq_id seq_id) { llama_io_write_dummy io; try { - return state_seq_get_data(io, seq_id); + return state_seq_write_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); return 0; @@ -2074,7 +2070,7 @@ size_t llama_context_base::state_seq_get_size(llama_seq_id seq_id) { size_t llama_context_base::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { llama_io_write_buffer io(dst, size); try { - return state_seq_get_data(io, seq_id); + return state_seq_write_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); return 0; @@ -2084,7 +2080,7 @@ size_t llama_context_base::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst size_t llama_context_base::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { llama_io_read_buffer io(src, size); try { - return state_seq_set_data(io, seq_id); + return state_seq_read_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; @@ -2123,7 +2119,7 @@ bool llama_context_base::state_load_file(const char * filepath, llama_token * to const size_t n_state_size_cur = file.size() - 
file.tell(); llama_io_read_file io( &file); - const size_t n_read = state_set_data(io); + const size_t n_read = state_read_data(io); if (n_read != n_state_size_cur) { LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); @@ -2146,7 +2142,7 @@ bool llama_context_base::state_save_file(const char * filepath, const llama_toke // save the context state using stream saving llama_io_write_file io(&file); - state_get_data(io); + state_write_data(io); return true; } @@ -2182,7 +2178,7 @@ size_t llama_context_base::state_seq_load_file(llama_seq_id seq_id, const char * { const size_t state_size = file.size() - file.tell(); llama_io_read_file io(&file); - const size_t nread = state_seq_set_data(io, seq_id); + const size_t nread = state_seq_read_data(io, seq_id); if (!nread) { LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); return 0; @@ -2206,7 +2202,7 @@ size_t llama_context_base::state_seq_save_file(llama_seq_id seq_id, const char * // save the context state using stream saving llama_io_write_file io(&file); - state_seq_get_data(io, seq_id); + state_seq_write_data(io, seq_id); const size_t res = file.tell(); GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes()); @@ -2214,7 +2210,7 @@ size_t llama_context_base::state_seq_save_file(llama_seq_id seq_id, const char * return res; } -size_t llama_context_base::state_get_data(llama_io_write_i & io) { +size_t llama_context_base::state_write_data(llama_io_write_i & io) { LLAMA_LOG_DEBUG("%s: writing state\n", __func__); // write model info @@ -2287,7 +2283,7 @@ size_t llama_context_base::state_get_data(llama_io_write_i & io) { return io.n_bytes(); } -size_t llama_context_base::state_set_data(llama_io_read_i & io) { +size_t llama_context_base::state_read_data(llama_io_read_i & io) { LLAMA_LOG_DEBUG("%s: reading state\n", __func__); // read model info @@ -2368,13 +2364,13 @@ size_t llama_context_base::state_set_data(llama_io_read_i & io) { return io.n_bytes(); } -size_t llama_context_base::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { +size_t llama_context_base::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); return io.n_bytes(); } -size_t llama_context_base::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { +size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); return io.n_bytes(); @@ -2400,9 +2396,6 @@ llama_context_kv_self::llama_context_kv_self( LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); - // build worst-case graph for encoder if a model contains encoder - is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() - uint32_t kv_size = cparams.n_ctx; ggml_type type_k = params.type_k; ggml_type type_v = params.type_v; @@ -2537,8 +2530,6 @@ void llama_context_kv_self::kv_self_update() { } int llama_context_kv_self::encode(llama_batch & inp_batch) { - is_encoding = true; - if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -2589,7 +2580,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { output_ids[i] = i; } - inp_embd_enc = NULL; n_outputs = n_tokens; //batch_manager->prepare(ubatch); @@ -2624,65 +2614,48 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); - if 
(llama_model_has_decoder(&model)) { - embd_enc.resize(n_tokens*n_embd); - float * embd_out = embd_enc.data(); - - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - // remember the sequence ids used during the encoding - needed for cross attention later - seq_ids_enc.resize(n_tokens); - for (int32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - seq_ids_enc[i].insert(seq_id); - } - } - } else { - GGML_ASSERT(embd != nullptr); + GGML_ASSERT(embd != nullptr); - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(embd != nullptr); - float * embd_out = embd; + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings - auto & embd_seq_out = embd_seq; - embd_seq_out.clear(); + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - for (int32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); } - } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } } } @@ -2694,8 +2667,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { } int llama_context_kv_self::decode(llama_batch & inp_batch) { - is_encoding = false; - if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: 
n_tokens == 0\n", __func__); return -1; @@ -3039,7 +3010,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { if (inp.self_kq_mask || inp.self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. - if (cparams.causal_attn && !is_encoding) { + if (cparams.causal_attn) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -3116,7 +3087,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); @@ -3175,50 +3146,9 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } } - - if (!is_encoding && inp_embd_enc) { - assert(inp_embd_enc->type == GGML_TYPE_F32); - assert((size_t) ggml_nelements(inp_embd_enc) == embd_enc.size()); - - ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); - } - - if (!is_encoding && inp_kq_mask_cross) { - const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask_cross->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - - float * data = (float *) inp_kq_mask_cross->data; - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_output_enc; ++i) { - float f = -INFINITY; - for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[j][s]; - if (seq_ids_enc[i].find(seq_id) != seq_ids_enc[i].end()) { - f = 0.0f; - } - } - data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f; - } - } - - for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_output_enc; ++j) { - data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY; - } - } - } - } } ggml_cgraph * llama_context_kv_self::graph_init() { - inp_embd_enc = nullptr; - inp_kq_mask_cross = nullptr; - inp = {}; return llama_context_base::graph_init(); @@ -3441,7 +3371,7 @@ void llama_context_kv_self::build_kv_self_defrag( // - x2 for keys and values //const uint32_t max_moves = max_nodes()/(6*n_layer); // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (max_nodes() - 2*n_layer)/(6*n_layer); + const uint32_t max_moves = (graph_max_nodes() - 2*n_layer)/(6*n_layer); // determine which KV cells to move where // @@ -3689,39 +3619,10 @@ void llama_context_kv_self::build_kv_self_defrag( #endif } -ggml_tensor * llama_context_kv_self::build_inp_embd_enc( - ggml_context * ctx0) { - const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; - - // TODO: not sure if this is correct - const int32_t n_outputs_enc = embd_enc.size() / n_embd; - - inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); - ggml_set_input(inp_embd_enc); - - return inp_embd_enc; -} - -ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( - ggml_context * ctx0, - int32_t n_tokens) { - const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; - - // TODO: not sure if this is correct - const int32_t 
n_outputs_enc = embd_enc.size() / n_embd; - - inp_kq_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - ggml_set_input(inp_kq_mask_cross); - - return inp_kq_mask_cross; -} - // state save/load -size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { - llama_context_base::state_get_data(io); +size_t llama_context_kv_self::state_write_data(llama_io_write_i & io) { + llama_context_base::state_write_data(io); LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); kv_self.state_write(io); @@ -3729,8 +3630,8 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { return io.n_bytes(); } -size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { - llama_context_base::state_set_data(io); +size_t llama_context_kv_self::state_read_data(llama_io_read_i & io) { + llama_context_base::state_read_data(io); LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); kv_self.state_read(io); @@ -3738,16 +3639,16 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - llama_context_base::state_seq_get_data(io, seq_id); +size_t llama_context_kv_self::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { + llama_context_base::state_seq_write_data(io, seq_id); kv_self.state_write(io, seq_id); return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - llama_context_base::state_seq_set_data(io, seq_id); +size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { + llama_context_base::state_seq_read_data(io, seq_id); kv_self.state_read(io, seq_id); @@ -4603,54 +4504,568 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( // state save/load -size_t llama_context_recurrent::state_get_data(llama_io_write_i & io) { - llama_context_base::state_get_data(io); +size_t llama_context_recurrent::state_write_data(llama_io_write_i & io) { + llama_context_base::state_write_data(io); kv_self.state_write(io); return io.n_bytes(); } -size_t llama_context_recurrent::state_set_data(llama_io_read_i & io) { - llama_context_base::state_set_data(io); +size_t llama_context_recurrent::state_read_data(llama_io_read_i & io) { + llama_context_base::state_read_data(io); kv_self.state_read(io); return io.n_bytes(); } -size_t llama_context_recurrent::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - llama_context_base::state_seq_get_data(io, seq_id); +size_t llama_context_recurrent::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { + llama_context_base::state_seq_write_data(io, seq_id); kv_self.state_write(io, seq_id); return io.n_bytes(); } -size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - llama_context_base::state_seq_set_data(io, seq_id); +size_t llama_context_recurrent::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { + llama_context_base::state_seq_read_data(io, seq_id); kv_self.state_read(io, seq_id); return io.n_bytes(); } +// +// llama_context_enc +// + +int llama_context_enc::encode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + llama_batch_allocr batch_allocr(inp_batch, 0); + + const llama_batch & batch = batch_allocr.batch; + + const int32_t 
n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (output_reserve(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (int32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + n_outputs = n_tokens; + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + GGML_ASSERT(embd != nullptr); + + // extract token embeddings + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
+ ggml_backend_sched_reset(sched.get()); + + cross->n_outputs = n_tokens; + cross->embd_enc = embd; + + // remember the sequence ids used during the encoding - needed for cross attention later + cross->seq_ids_enc.resize(n_tokens); + for (int32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + cross->seq_ids_enc[i].insert(seq_id); + } + } + + return 0; +} + +// +// llama_context_dec +// + +void llama_context_dec::reserve() { + // simulate full KV cache + cross->n_outputs = cparams.n_ubatch; + + LLAMA_LOG_DEBUG("%s: n_outputs = %u\n", __func__, cross->n_outputs); + + llama_context_kv_self::reserve(); +} + +void llama_context_dec::input_set(const llama_ubatch & ubatch) { + // call base functionality + llama_context_kv_self::input_set(ubatch); + + if (inp.cross_embd) { + assert(inp.cross_embd->type == GGML_TYPE_F32); + assert(ggml_nelements(inp.cross_embd) == cross->n_outputs*model.hparams.n_embd); + + ggml_backend_tensor_set(inp.cross_embd, cross->embd_enc, 0, ggml_nbytes(inp.cross_embd)); + } + + if (inp.cross_kq_mask) { + const int64_t n_output_enc = cross->n_outputs; + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp.cross_kq_mask->buffer)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing + + float * data = (float *) inp.cross_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_output_enc; ++i) { + float f = -INFINITY; + for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[j][s]; + if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) { + f = 0.0f; + } + } + data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f; + } + } + + for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + for (int j = 0; j < n_output_enc; ++j) { + data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY; + } + } + } + } +} + +ggml_cgraph * llama_context_dec::graph_init() { + inp = {}; + + return llama_context_kv_self::graph_init(); +} + +ggml_tensor * llama_context_dec::build_inp_cross_embd( + ggml_context * ctx0) { + const auto & hparams = model.hparams; + const int64_t n_embd = hparams.n_embd; + + const int32_t n_outputs_enc = cross->n_outputs; + + inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); + ggml_set_input(inp.cross_embd); + + return inp.cross_embd; +} + +void llama_context_dec::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) { + llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); + + const int32_t n_outputs_enc = cross->n_outputs; + + inp.cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + ggml_set_input(inp.cross_kq_mask); + + inp.cross_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp.cross_kq_mask, GGML_TYPE_F16) : inp.cross_kq_mask; +} + +ggml_tensor * llama_context_dec::build_attn_cross( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il) { + GGML_UNUSED(il); + + const auto & kq_mask = inp.cross_kq_mask_cnv; + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + //cb(k, "v", il); + + ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale); + + return cur; +} + // // llama_context_enc_dec // llama_context_enc_dec::llama_context_enc_dec( const llama_model & model, - llama_context_params params) : - llama_context_enc(model, params, LLAMA_GRAPH_TYPE_ENCODER), - ctx_dec(model, params, LLAMA_GRAPH_TYPE_DECODER) { + llama_context_params params) { LLAMA_LOG_INFO("%s: constructing llama_context_enc_dec\n", __func__); + + ctx_enc = std::make_unique<llama_context_enc>(model, params, LLAMA_GRAPH_TYPE_ENCODER); + ctx_dec = std::make_unique<llama_context_dec>(model, params, LLAMA_GRAPH_TYPE_DECODER); + + ctx_enc->cross = &cross; + ctx_dec->cross = &cross; } llama_context_enc_dec::~llama_context_enc_dec() { LLAMA_LOG_INFO("%s: destructing llama_context_enc_dec\n", __func__); } +void llama_context_enc_dec::init() { + ctx_enc->init(); + ctx_dec->init(); +} + +void llama_context_enc_dec::synchronize() { + ctx_enc->synchronize(); + ctx_dec->synchronize(); +} + +const llama_model & llama_context_enc_dec::get_model() const { + return ctx_enc->get_model(); +} + +uint32_t llama_context_enc_dec::n_ctx() const { + return ctx_dec->n_ctx(); +} + +uint32_t llama_context_enc_dec::n_ctx_per_seq() const { + return ctx_dec->n_ctx_per_seq(); +} + +uint32_t llama_context_enc_dec::n_batch() const { + return ctx_dec->n_batch(); +} + +uint32_t llama_context_enc_dec::n_ubatch() const { + return ctx_dec->n_ubatch(); +} + +uint32_t llama_context_enc_dec::n_seq_max() const { + return ctx_dec->n_seq_max(); +} + +uint32_t llama_context_enc_dec::n_threads() const { + return ctx_dec->n_threads(); +} + +uint32_t llama_context_enc_dec::n_threads_batch() const { + return ctx_dec->n_threads_batch(); +} + +llama_kv_cache * llama_context_enc_dec::get_kv_self() { + return ctx_dec->get_kv_self(); +} + +const llama_kv_cache * llama_context_enc_dec::get_kv_self() const { + return ctx_dec->get_kv_self(); +} + +void llama_context_enc_dec::kv_self_update() { + ctx_dec->kv_self_update(); +} + +enum llama_pooling_type llama_context_enc_dec::pooling_type() const { + return ctx_enc->pooling_type(); +} + +float * llama_context_enc_dec::get_logits() { + return ctx_dec->get_logits(); +} + +float * llama_context_enc_dec::get_logits_ith(int32_t i) { + return ctx_dec->get_logits_ith(i); +} + +float * llama_context_enc_dec::get_embeddings() { + return ctx_enc->get_embeddings(); +} + +float * llama_context_enc_dec::get_embeddings_ith(int32_t i) { + return ctx_enc->get_embeddings_ith(i); +} + +float * llama_context_enc_dec::get_embeddings_seq(llama_seq_id seq_id) { + return ctx_enc->get_embeddings_seq(seq_id); +} + +void llama_context_enc_dec::attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + // TODO: attach to both - not sure if this is correct + ctx_enc->attach_threadpool(threadpool, threadpool_batch); + ctx_dec->attach_threadpool(threadpool, threadpool_batch); +} + +void llama_context_enc_dec::detach_threadpool() { 
ctx_enc->detach_threadpool(); + ctx_dec->detach_threadpool(); +} + +void llama_context_enc_dec::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { + ctx_enc->set_n_threads(n_threads, n_threads_batch); + ctx_dec->set_n_threads(n_threads, n_threads_batch); +} + +void llama_context_enc_dec::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { + ctx_enc->set_abort_callback(abort_callback, abort_callback_data); + ctx_dec->set_abort_callback(abort_callback, abort_callback_data); +} + +void llama_context_enc_dec::set_embeddings(bool value) { + GGML_UNUSED(value); + LLAMA_LOG_WARN("%s: set_embeddings() not supported for llama_context_enc_dec\n", __func__); +} + +void llama_context_enc_dec::set_causal_attn(bool value) { + GGML_UNUSED(value); + LLAMA_LOG_WARN("%s: set_causal_attn() not supported for llama_context_enc_dec\n", __func__); +} + +void llama_context_enc_dec::set_adapter_lora( + llama_adapter_lora * adapter, + float scale) { + ctx_dec->set_adapter_lora(adapter, scale); +} + +bool llama_context_enc_dec::rm_adapter_lora( + llama_adapter_lora * adapter) { + return ctx_dec->rm_adapter_lora(adapter); +} + +void llama_context_enc_dec::clear_adapter_lora() { + ctx_dec->clear_adapter_lora(); +} + +bool llama_context_enc_dec::apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return ctx_dec->apply_adapter_cvec(data, len, n_embd, il_start, il_end); +} + +int llama_context_enc_dec::encode(llama_batch & inp_batch) { + return ctx_enc->encode(inp_batch); +} + +int llama_context_enc_dec::decode(llama_batch & inp_batch) { + return ctx_dec->decode(inp_batch); +} + +// +// perf +// + +llama_perf_context_data llama_context_enc_dec::perf_get_data() const { + return ctx_dec->perf_get_data(); +} + +void llama_context_enc_dec::perf_reset() { + ctx_enc->perf_reset(); + ctx_dec->perf_reset(); +} + +// +// state save/load +// + +size_t llama_context_enc_dec::state_get_size() { + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_get_data( uint8_t * dst, size_t size) { + GGML_UNUSED(dst); + GGML_UNUSED(size); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_set_data(const uint8_t * src, size_t size) { + GGML_UNUSED(src); + GGML_UNUSED(size); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_get_size(llama_seq_id seq_id) { + GGML_UNUSED(seq_id); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { + GGML_UNUSED(seq_id); + GGML_UNUSED(dst); + GGML_UNUSED(size); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { + GGML_UNUSED(seq_id); + GGML_UNUSED(src); + GGML_UNUSED(size); + GGML_ABORT("TODO: implement"); +} + +bool llama_context_enc_dec::state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) { + GGML_UNUSED(filepath); + GGML_UNUSED(tokens_out); + GGML_UNUSED(n_token_capacity); + GGML_UNUSED(n_token_count_out); + GGML_ABORT("TODO: implement"); +} + +bool llama_context_enc_dec::state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) { + GGML_UNUSED(filepath); + GGML_UNUSED(tokens); + GGML_UNUSED(n_token_count); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + 
llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) { + GGML_UNUSED(seq_id); + GGML_UNUSED(filepath); + GGML_UNUSED(tokens_out); + GGML_UNUSED(n_token_capacity); + GGML_UNUSED(n_token_count_out); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) { + GGML_UNUSED(seq_id); + GGML_UNUSED(filepath); + GGML_UNUSED(tokens); + GGML_UNUSED(n_token_count); + GGML_ABORT("TODO: implement"); +} + // // interface implementation // diff --git a/src/llama-context.h b/src/llama-context.h index d647a426cd1be..3165865a73c37 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -30,8 +30,7 @@ struct llama_context { virtual void synchronize() = 0; - virtual const llama_model & get_model() const = 0; - virtual const llama_cparams & get_cparams() const = 0; + virtual const llama_model & get_model() const = 0; virtual uint32_t n_ctx() const = 0; virtual uint32_t n_ctx_per_seq() const = 0; @@ -42,8 +41,6 @@ struct llama_context { virtual uint32_t n_threads() const = 0; virtual uint32_t n_threads_batch() const = 0; - virtual int32_t max_nodes() const = 0; - // self-attention: // if the context does not have a KV cache, return nullptr @@ -62,8 +59,6 @@ struct llama_context { virtual float * get_embeddings_ith(int32_t i) = 0; virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; - virtual int64_t n_pos_per_token() const = 0; // vision - virtual void attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) = 0; @@ -190,8 +185,7 @@ class llama_context_base : public llama_context_i, public llama_graph_i { virtual void reserve(); public: - const llama_model & get_model() const override; - const llama_cparams & get_cparams() const override; + const llama_model & get_model() const override; uint32_t n_ctx() const override; uint32_t n_ctx_per_seq() const override; @@ -202,15 +196,9 @@ class llama_context_base : public llama_context_i, public llama_graph_i { uint32_t n_threads() const override; uint32_t n_threads_batch() const override; - int32_t max_nodes() const override; - - // self-attention: - - // if the context does not have a KV cache, return nullptr llama_kv_cache * get_kv_self() override; const llama_kv_cache * get_kv_self() const override; - // if the context does not have a KV cache, noop void kv_self_update() override; enum llama_pooling_type pooling_type() const override; @@ -222,8 +210,6 @@ class llama_context_base : public llama_context_i, public llama_graph_i { float * get_embeddings_ith(int32_t i) override; float * get_embeddings_seq(llama_seq_id seq_id) override; - int64_t n_pos_per_token() const override; // vision - void attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) override; @@ -261,6 +247,8 @@ class llama_context_base : public llama_context_i, public llama_graph_i { // input // + virtual int64_t n_pos_per_token() const; // vision + // when the compute graph is built, it creates the input tensors that it needs // the contents of the input tensors are set by the input_set() function @@ -299,6 +287,8 @@ class llama_context_base : public llama_context_i, public llama_graph_i { // graph // + virtual int32_t graph_max_nodes() const; + // zero-out inputs and create the ctx_compute for the compute graph virtual ggml_cgraph * graph_init(); @@ -477,11 +467,11 @@ class llama_context_base : public llama_context_i, public llama_graph_i { size_t n_token_count) 
override; protected: - virtual size_t state_get_data(llama_io_write_i & io); - virtual size_t state_set_data(llama_io_read_i & io); + virtual size_t state_write_data(llama_io_write_i & io); + virtual size_t state_read_data (llama_io_read_i & io); - virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); - virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); + virtual size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id); + virtual size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id); // // members @@ -625,39 +615,15 @@ class llama_context_kv_self : public llama_context_base { ggml_context * ctx0, ggml_cgraph * gf) override; - // ======================================================= - // === encoder-decoder === - // - // TODO: this is temporary here, it will be moved - // - - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] - - ggml_tensor * build_inp_embd_enc( - ggml_context * ctx0) override; - - ggml_tensor * build_inp_kq_mask_cross( - ggml_context * ctx0, - int32_t n_tokens) override; - // ====================================================== - // // state save/load // - size_t state_get_data(llama_io_write_i & io) override; - size_t state_set_data(llama_io_read_i & io) override; + size_t state_write_data(llama_io_write_i & io) override; + size_t state_read_data (llama_io_read_i & io) override; - size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) override; + size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id) override; private: // @@ -767,11 +733,11 @@ class llama_context_recurrent : public llama_context_base { // state save/load // - size_t state_get_data(llama_io_write_i & io) override; - size_t state_set_data(llama_io_read_i & io) override; + size_t state_write_data(llama_io_write_i & io) override; + size_t state_read_data (llama_io_read_i & io) override; - size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) override; + size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id) override; private: // @@ -782,21 +748,206 @@ class llama_context_recurrent : public llama_context_base { llama_kv_cache_recurrent kv_self; }; +// TODO: tmp - need something better +struct llama_cross { + int32_t n_outputs; + float * embd_enc; + + std::vector> seq_ids_enc; +}; + class llama_context_enc : public llama_context_base { public: using llama_context_base::llama_context_base; + + int encode(llama_batch & inp_batch) override; + + llama_cross * cross = nullptr; }; -class llama_context_enc_dec : public llama_context_enc { +class llama_context_dec : public llama_context_kv_self { +public: + using llama_context_kv_self::llama_context_kv_self; + +protected: + void reserve() override; + + // + // input + // + + void input_set(const llama_ubatch & ubatch) override; + +private: + struct { + ggml_tensor * cross_embd; // F32 [n_embd, 
n_outputs_enc] + ggml_tensor * cross_kq_mask; // F32 [n_outputs_enc, n_batch] + ggml_tensor * cross_kq_mask_cnv; // F32 [n_outputs_enc, n_batch] + } inp; + +protected: + // + // graph + // + + ggml_cgraph * graph_init() override; + + ggml_tensor * build_inp_cross_embd( + ggml_context * ctx0) override; + + void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) override; + + ggml_tensor * build_attn_cross( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il) override; + +public: + llama_cross * cross = nullptr; +}; + +class llama_context_enc_dec : public llama_context_i { public: llama_context_enc_dec( const llama_model & model, llama_context_params params); - virtual ~llama_context_enc_dec(); + ~llama_context_enc_dec(); + + void init() override; + + void synchronize() override; + + const llama_model & get_model() const override; + + // TODO: the default implementation of these getters calls the corresponding getter of the enc or dec context + // in the future, the public API in llama.h should allow to get references to the context that the user wants + // this will allow to specify the desired context explicitly + // for example: + // + // // this can be an enc-dec context + // llama_context_t ctx = llama_init_from_model(...); + // + // ... + // + // llama_context_t ctx_enc = llama_get_ctx_enc(ctx); + // llama_set_embeddings(ctx_enc, true); + // + // llama_context_t ctx_dec = llama_get_ctx_dec(ctx); + // llama_set_causal_attn(ctx_dec, true); + // + uint32_t n_ctx() const override; + uint32_t n_ctx_per_seq() const override; + uint32_t n_batch() const override; + uint32_t n_ubatch() const override; + uint32_t n_seq_max() const override; + + uint32_t n_threads() const override; + uint32_t n_threads_batch() const override; + + llama_kv_cache * get_kv_self() override; + const llama_kv_cache * get_kv_self() const override; + + void kv_self_update() override; + + enum llama_pooling_type pooling_type() const override; + + float * get_logits() override; + float * get_logits_ith(int32_t i) override; + + float * get_embeddings() override; + float * get_embeddings_ith(int32_t i) override; + float * get_embeddings_seq(llama_seq_id seq_id) override; + + void attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) override; + + void detach_threadpool() override; + + void set_n_threads(int32_t n_threads, int32_t n_threads_batch) override; + + void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) override; + + void set_embeddings (bool value) override; + void set_causal_attn(bool value) override; + + void set_adapter_lora( + llama_adapter_lora * adapter, + float scale) override; + + bool rm_adapter_lora( + llama_adapter_lora * adapter) override; + + void clear_adapter_lora() override; + + bool apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) override; + + int encode(llama_batch & inp_batch) override; + int decode(llama_batch & inp_batch) override; + + // + // perf + // + + llama_perf_context_data perf_get_data() const override; + void perf_reset() override; + + // + // state save/load + // + + size_t state_get_size() override; + size_t state_get_data( uint8_t * dst, size_t size) override; + size_t state_set_data(const uint8_t * src, size_t size) override; + + size_t state_seq_get_size(llama_seq_id seq_id) override; + size_t 
state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; + size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; + + bool state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) override; + + bool state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) override; + + size_t state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) override; + + size_t state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) override; private: - llama_context_kv_self ctx_dec; + std::unique_ptr ctx_enc; + std::unique_ptr ctx_dec; + + llama_cross cross; }; // For internal test use diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 99eb326205bc6..1e336e844ada0 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -26,7 +26,29 @@ ggml_tensor * llama_graph_i::build_attn( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_embd_enc( +ggml_tensor * llama_graph_i::build_attn_cross( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(q_cur); + GGML_UNUSED(k_cur); + GGML_UNUSED(v_cur); + GGML_UNUSED(kq_b); + GGML_UNUSED(kq_scale); + GGML_UNUSED(il); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_i::build_inp_cross_embd( ggml_context * ctx0) { GGML_UNUSED(ctx0); @@ -34,7 +56,7 @@ ggml_tensor * llama_graph_i::build_inp_embd_enc( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_kq_mask_cross( +ggml_tensor * llama_graph_i::build_inp_cross_kq_mask( ggml_context * ctx0, int32_t n_tokens) { GGML_UNUSED(ctx0); diff --git a/src/llama-graph.h b/src/llama-graph.h index c84c254934ff1..28e8a563067db 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -114,10 +114,20 @@ class llama_graph_i { float kq_scale, int il); - virtual ggml_tensor * build_inp_embd_enc( + virtual ggml_tensor * build_attn_cross( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il); + + virtual ggml_tensor * build_inp_cross_embd( ggml_context * ctx0); - virtual ggml_tensor * build_inp_kq_mask_cross( + virtual ggml_tensor * build_inp_cross_kq_mask( ggml_context * ctx0, int32_t n_tokens); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e8057f4687fdf..38e8c2812fcbb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3964,16 +3964,16 @@ struct llm_build_context { } // TODO: tmp - struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0); + struct ggml_tensor * build_inp_cross_embd() { + ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0); cb(cur, "embd_enc", -1); return cur; } // TODO: tmp - struct ggml_tensor * build_inp_kq_mask_cross() { - ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens); + struct ggml_tensor * build_inp_cross_kq_mask() { + ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens); cb(cur, "KQ_mask_cross", -1); return cur; @@ -4294,6 +4294,42 @@ struct llm_build_context { return cur; } + struct ggml_tensor * build_attn_cross( + struct ggml_cgraph * gf, + struct ggml_tensor 
* wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int32_t n_tokens, // TODO: remove + float kq_scale, + int il) { + GGML_UNUSED(n_tokens); + + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * cur = lgf->build_attn_cross(ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); + cb(cur, "kqv_out", il); + + if (wo) { + cur = lgf->build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; + } + struct ggml_tensor * build_attn_with_kq_b( struct ggml_cgraph * gf, struct ggml_tensor * wo, @@ -9762,209 +9798,173 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); } - //void build_t5_dec(ggml_cgraph * gf) { - // const int64_t n_embd_head = hparams.n_embd_head_v; - // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - - // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - - // inpL = build_inp_embd(model.tok_embd); - - // GGML_ASSERT(!lctx.is_encoding); - // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - - // struct ggml_tensor * embd_enc = build_inp_embd_enc(); - // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - - // struct ggml_tensor * KQ_mask_dec = build_inp_kq_mask(); - // struct ggml_tensor * KQ_mask_cross = build_inp_kq_mask_cross(); - - // for (int il = 0; il < n_layer; ++il) { - // struct ggml_tensor * inpSA = inpL; - - // // norm - // cur = build_norm(inpL, - // model.layers[il].attn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm", il); - - // // self-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - // cb(Qcur, "Qcur", il); - - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - // cb(Kcur, "Kcur", il); + void build_t5_dec(ggml_cgraph * gf) { + const int64_t n_embd_head = hparams.n_embd_head_v; + //const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - // cb(Vcur, "Vcur", il); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - // build_kv_store(gf, Kcur, Vcur, il); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * k = - // ggml_view_3d(ctx0, kv_self.k_l[il], - // n_embd_head_k, n_kv, n_head_kv, - // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - // 0); - // cb(k, "k", il); + inpL = build_inp_embd(model.tok_embd); - // struct ggml_tensor * v = - // ggml_view_3d(ctx0, kv_self.v_l[il], - // n_kv, n_embd_head_v, n_head_kv, - // ggml_element_size(kv_self.v_l[il])*n_ctx, - // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - // 0); - // cb(v, "v", il); + struct ggml_tensor * embd_enc = build_inp_cross_embd(); + struct ggml_tensor * pos_bucket_dec = build_pos_bucket(); - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + const int64_t n_outputs_enc = embd_enc->ne[1]; - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + lgf->build_attn_inp(ctx0, n_tokens, true, false); - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); + for (int il = 0; il < n_layer; ++il) { + struct 
ggml_tensor * inpSA = inpL; - // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - // cb(kq_b, "kq_b", il); + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); - // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - // cb(kqv, "kqv", il); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - // ggml_build_forward_expand(gf, cur); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b); - // cur = build_lora_mm(model.layers[il].wo, cur); - // cb(cur, "kqv_out", il); - // } + cur = build_attn_with_kq_b(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); + cb(cur, "kqv_out", il); + } - // cur = ggml_add(ctx0, cur, inpSA); - // cb(cur, "cross_inp", il); + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "cross_inp", il); - // struct ggml_tensor * inpCA = cur; + struct ggml_tensor * inpCA = cur; - // // norm - // cur = build_norm(cur, - // model.layers[il].attn_norm_cross, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm_cross", il); + // norm + cur = build_norm(cur, + model.layers[il].attn_norm_cross, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm_cross", il); - // // cross-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); - // cb(Qcur, "Qcur", il); + // cross-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + cb(Qcur, "Qcur", il); - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); - // cb(Kcur, "Kcur", il); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + cb(Kcur, "Kcur", il); - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); - // cb(Vcur, "Vcur", il); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + cb(Vcur, "Vcur", il); - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc); - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - // struct ggml_tensor * k = ggml_cont(ctx0, 
ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + cur = build_attn_cross(gf, + model.layers[il].wo_cross, nullptr, + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); + cb(cur, "kqv_out", il); - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); + //struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + //struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); + //struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); - // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - // cb(v, "v", il); + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - // cb(kqv, "kqv", il); + //struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + //cb(v, "v", il); - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); + //struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + //cb(kqv, "kqv", il); - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); + //struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); - // ggml_build_forward_expand(gf, cur); + //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + //cb(cur, "kqv_merged_cont", il); - // cur = build_lora_mm(model.layers[il].wo_cross, cur); - // cb(cur, "kqv_out", il); - // } + //ggml_build_forward_expand(gf, cur); - // if (il == n_layer - 1) { - // // skip computing output for unused tokens - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - // } + //cur = build_lora_mm(model.layers[il].wo_cross, cur); + //cb(cur, "kqv_out", il); + } - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - // cb(ffn_inp, "ffn_inp", il); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + } - // // feed-forward network - // { - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + cb(ffn_inp, "ffn_inp", il); - // // T5 uses relu, flan-T5 uses gelu-gated - // cur = build_ffn(cur, - // model.layers[il].ffn_up, NULL, NULL, - // model.layers[il].ffn_gate, NULL, NULL, - // model.layers[il].ffn_down, NULL, NULL, - // NULL, - // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - // il); - // cb(cur, "ffn_out", il); - // } + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - // cur = ggml_add(ctx0, cur, ffn_inp); - // cb(cur, "ffn_out", il); + // T5 uses relu, flan-T5 uses gelu-gated + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + il); + cb(cur, "ffn_out", il); + } - // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - // if (layer_dir != nullptr) { - // cur = ggml_add(ctx0, cur, layer_dir); - // } - // cb(cur, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); - // // input for next layer - // inpL = cur; - // } + cur = lgf->build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); - // cur = inpL; - // cb(cur, "result_embd", -1); + // input for next layer + inpL = cur; + } - // cur = build_norm(cur, - // model.output_norm, NULL, - // LLM_NORM_RMS, -1); + cur = inpL; + cb(cur, "result_embd", -1); - // cb(cur, "result_norm", -1); - // res.t_embd = cur; + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); - // // lm_head - // cur = build_lora_mm(model.output, cur); + cb(cur, "result_norm", -1); + res.t_embd = cur; - // cb(cur, "result_output", -1); - // res.t_logits = cur; + // lm_head + cur = build_lora_mm(model.output, cur); - // ggml_build_forward_expand(gf, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; - // return gf; - //} + ggml_build_forward_expand(gf, cur); + } void build_jais(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -11119,7 +11119,7 @@ llama_graph_result llama_model::build_graph( llm.build_t5_enc(gf); break; case LLAMA_GRAPH_TYPE_DECODER: - //llm.build_t5_dec(gf); + llm.build_t5_dec(gf); break; default: GGML_ABORT("invalid graph type"); From e5bc5f8e029b668078f76eb779eac52b183ff660 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 25 Feb 2025 12:10:34 +0200 Subject: [PATCH 75/84] context : enc-dec is now working ggml-ci --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 38e8c2812fcbb..8e579d8e88fa1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9741,7 +9741,7 @@ struct llm_build_context { struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b); cur = build_attn_with_kq_b(gf, - model.layers[il].wo, model.layers[il].bo, + model.layers[il].wo_enc, nullptr, Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); cb(cur, "kqv_out", il); } From e2b3294f2c13c468ca9f798525344b063dafa378 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 25 Feb 2025 12:14:34 +0200 Subject: [PATCH 76/84] context : fix enc-dec state save/load ggml-ci --- src/llama-context.cpp | 48 ++++++++++--------------------------------- 1 file changed, 11 insertions(+), 37 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 81663c40018e3..dacf809086cb1 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -4981,41 +4981,31 @@ void llama_context_enc_dec::perf_reset() { // // state save/load +// TODO: for now dump just the decoder state, in the future dump both // size_t llama_context_enc_dec::state_get_size() { - GGML_ABORT("TODO: implement"); + return ctx_dec->state_get_size(); } size_t 
llama_context_enc_dec::state_get_data( uint8_t * dst, size_t size) { - GGML_UNUSED(dst); - GGML_UNUSED(size); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_get_data(dst, size); } size_t llama_context_enc_dec::state_set_data(const uint8_t * src, size_t size) { - GGML_UNUSED(src); - GGML_UNUSED(size); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_set_data(src, size); } size_t llama_context_enc_dec::state_seq_get_size(llama_seq_id seq_id) { - GGML_UNUSED(seq_id); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_get_size(seq_id); } size_t llama_context_enc_dec::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { - GGML_UNUSED(seq_id); - GGML_UNUSED(dst); - GGML_UNUSED(size); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_get_data(seq_id, dst, size); } size_t llama_context_enc_dec::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { - GGML_UNUSED(seq_id); - GGML_UNUSED(src); - GGML_UNUSED(size); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_set_data(seq_id, src, size); } bool llama_context_enc_dec::state_load_file( @@ -5023,21 +5013,14 @@ bool llama_context_enc_dec::state_load_file( llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - GGML_UNUSED(filepath); - GGML_UNUSED(tokens_out); - GGML_UNUSED(n_token_capacity); - GGML_UNUSED(n_token_count_out); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_load_file(filepath, tokens_out, n_token_capacity, n_token_count_out); } bool llama_context_enc_dec::state_save_file( const char * filepath, const llama_token * tokens, size_t n_token_count) { - GGML_UNUSED(filepath); - GGML_UNUSED(tokens); - GGML_UNUSED(n_token_count); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_save_file(filepath, tokens, n_token_count); } size_t llama_context_enc_dec::state_seq_load_file( @@ -5046,12 +5029,7 @@ size_t llama_context_enc_dec::state_seq_load_file( llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - GGML_UNUSED(seq_id); - GGML_UNUSED(filepath); - GGML_UNUSED(tokens_out); - GGML_UNUSED(n_token_capacity); - GGML_UNUSED(n_token_count_out); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_load_file(seq_id, filepath, tokens_out, n_token_capacity, n_token_count_out); } size_t llama_context_enc_dec::state_seq_save_file( @@ -5059,11 +5037,7 @@ size_t llama_context_enc_dec::state_seq_save_file( const char * filepath, const llama_token * tokens, size_t n_token_count) { - GGML_UNUSED(seq_id); - GGML_UNUSED(filepath); - GGML_UNUSED(tokens); - GGML_UNUSED(n_token_count); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_save_file(seq_id, filepath, tokens, n_token_count); } // From 4efe9898862ccea908176a6801c643382f2e27f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 25 Feb 2025 16:11:17 +0200 Subject: [PATCH 77/84] context : pass embeddings tensor from encoder to decoder ggml-ci --- src/llama-context.cpp | 45 ++++++++++++++++++++++++------------------- src/llama-context.h | 7 ++++--- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index dacf809086cb1..f7c83e886ef1c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -4540,6 +4540,7 @@ size_t llama_context_recurrent::state_seq_read_data(llama_io_read_i & io, llama_ // llama_context_enc // +// TODO: avoid copy-paste of the entire encode() function int llama_context_enc::encode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { 
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); @@ -4671,8 +4672,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // overlap with device computation. ggml_backend_sched_reset(sched.get()); - cross->n_outputs = n_tokens; - cross->embd_enc = embd; + cross->t_embd = t_embd; // remember the sequence ids used during the encoding - needed for cross attention later cross->seq_ids_enc.resize(n_tokens); @@ -4692,9 +4692,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { void llama_context_dec::reserve() { // simulate full KV cache - cross->n_outputs = cparams.n_ubatch; - - LLAMA_LOG_DEBUG("%s: n_outputs = %u\n", __func__, cross->n_outputs); + cross->t_embd = nullptr; llama_context_kv_self::reserve(); } @@ -4703,15 +4701,15 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context_kv_self::input_set(ubatch); - if (inp.cross_embd) { - assert(inp.cross_embd->type == GGML_TYPE_F32); - assert(ggml_nelements(inp.cross_embd) == cross->n_outputs*model.hparams.n_embd); + //if (inp.cross_embd && inp.cross_embd->op != GGML_OP_NONE) { + // assert(inp.cross_embd->type == GGML_TYPE_F32); + // assert(ggml_nelements(inp.cross_embd) == cross->n_outputs*model.hparams.n_embd); - ggml_backend_tensor_set(inp.cross_embd, cross->embd_enc, 0, ggml_nbytes(inp.cross_embd)); - } + // ggml_backend_tensor_set(inp.cross_embd, cross->embd_enc, 0, ggml_nbytes(inp.cross_embd)); + //} if (inp.cross_kq_mask) { - const int64_t n_output_enc = cross->n_outputs; + const int64_t n_enc = inp.cross_kq_mask->ne[0]; const int64_t n_tokens = ubatch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(inp.cross_kq_mask->buffer)); @@ -4721,7 +4719,7 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_output_enc; ++i) { + for (int i = 0; i < n_enc; ++i) { float f = -INFINITY; for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch.seq_id[j][s]; @@ -4729,13 +4727,13 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { f = 0.0f; } } - data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f; + data[h*(n_enc*n_tokens) + j*n_enc + i] = f; } } for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_output_enc; ++j) { - data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY; + for (int j = 0; j < n_enc; ++j) { + data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY; } } } @@ -4750,12 +4748,19 @@ ggml_cgraph * llama_context_dec::graph_init() { ggml_tensor * llama_context_dec::build_inp_cross_embd( ggml_context * ctx0) { + // if we have the output embeddings from the encoder, use them directly + if (cross->t_embd) { + inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + + return inp.cross_embd; + } + const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; - const int32_t n_outputs_enc = cross->n_outputs; + const auto n_embd = hparams.n_embd; + const auto n_enc = hparams.n_ctx_train; - inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); + inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); ggml_set_input(inp.cross_embd); return inp.cross_embd; @@ -4768,9 +4773,9 @@ void llama_context_dec::build_attn_inp( bool swa) { llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); - const int32_t n_outputs_enc = cross->n_outputs; + const int32_t n_enc = cross->t_embd ? 
cross->t_embd->ne[1] : model.hparams.n_ctx_train; - inp.cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp.cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); ggml_set_input(inp.cross_kq_mask); inp.cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.cross_kq_mask, GGML_TYPE_F16) : inp.cross_kq_mask; diff --git a/src/llama-context.h b/src/llama-context.h index 3165865a73c37..af35b577b3af1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -748,11 +748,12 @@ class llama_context_recurrent : public llama_context_base { llama_kv_cache_recurrent kv_self; }; -// TODO: tmp - need something better +// TODO: tmp - need something better to pass the data from the encoder to the decoder struct llama_cross { - int32_t n_outputs; - float * embd_enc; + // the output embeddings from the encoder + ggml_tensor * t_embd = nullptr; + // needed to construct the cross-attention mask in the decoder std::vector> seq_ids_enc; }; From 952feedfca81134c686781ec210a6a15e5bd2b6c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 27 Feb 2025 15:07:10 +0200 Subject: [PATCH 78/84] context : disable encoder embd tensor for now ggml-ci --- src/llama-context.cpp | 23 ++++++++++++----------- src/llama-context.h | 7 ++++++- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f7c83e886ef1c..4341c571e3b2d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -4673,6 +4673,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); cross->t_embd = t_embd; + cross->v_embd = embd; // remember the sequence ids used during the encoding - needed for cross attention later cross->seq_ids_enc.resize(n_tokens); @@ -4701,12 +4702,11 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context_kv_self::input_set(ubatch); - //if (inp.cross_embd && inp.cross_embd->op != GGML_OP_NONE) { - // assert(inp.cross_embd->type == GGML_TYPE_F32); - // assert(ggml_nelements(inp.cross_embd) == cross->n_outputs*model.hparams.n_embd); + if (inp.cross_embd && cross->t_embd) { + assert(inp.cross_embd->type == GGML_TYPE_F32); - // ggml_backend_tensor_set(inp.cross_embd, cross->embd_enc, 0, ggml_nbytes(inp.cross_embd)); - //} + ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd)); + } if (inp.cross_kq_mask) { const int64_t n_enc = inp.cross_kq_mask->ne[0]; @@ -4749,16 +4749,17 @@ ggml_cgraph * llama_context_dec::graph_init() { ggml_tensor * llama_context_dec::build_inp_cross_embd( ggml_context * ctx0) { // if we have the output embeddings from the encoder, use them directly - if (cross->t_embd) { - inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + // TODO: needs more work to be correct, for now just use the tensor shape + //if (cross->t_embd) { + // inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd); - return inp.cross_embd; - } + // return inp.cross_embd; + //} const auto & hparams = model.hparams; - const auto n_embd = hparams.n_embd; - const auto n_enc = hparams.n_ctx_train; + const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; + const auto n_enc = cross->t_embd ? 
cross->t_embd->ne[1] : hparams.n_ctx_train; inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); ggml_set_input(inp.cross_embd); diff --git a/src/llama-context.h b/src/llama-context.h index af35b577b3af1..1b807ccf84a5c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -750,9 +750,14 @@ class llama_context_recurrent : public llama_context_base { // TODO: tmp - need something better to pass the data from the encoder to the decoder struct llama_cross { - // the output embeddings from the encoder + // the output embeddings from the encoder as a ggml tensor + // TODO: this needs more work to be correct, for now copy the embeddings data to host memory + // ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524 ggml_tensor * t_embd = nullptr; + // embeddings data copied to host memory (tmp) + float * v_embd = nullptr; + // needed to construct the cross-attention mask in the decoder std::vector> seq_ids_enc; }; From 828effd9d74d770e03852b6123d54f12e92bb950 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 27 Feb 2025 15:54:44 +0200 Subject: [PATCH 79/84] kv-cache : basic abstraction ggml-ci --- src/llama-context.cpp | 288 +++++++++++++++++++++-------------------- src/llama-context.h | 4 +- src/llama-kv-cache.cpp | 84 +++++++----- src/llama-kv-cache.h | 66 +++++++--- 4 files changed, 244 insertions(+), 198 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4341c571e3b2d..5c77b29c13a7d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2384,15 +2384,16 @@ llama_context_kv_self::llama_context_kv_self( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context_base(model, params, gtype), - kv_self(model.hparams) { + llama_context_base(model, params, gtype) { LLAMA_LOG_INFO("%s: constructing llama_context_kv_self\n", __func__); const auto & hparams = model.hparams; + kv_self = std::make_unique(hparams); + LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self.get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self->get_padding(cparams)); LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); @@ -2406,14 +2407,14 @@ llama_context_kv_self::llama_context_kv_self( GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); if (!hparams.vocab_only) { - if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); throw std::runtime_error("failed to initialize self-attention cache"); } { - const size_t memory_size_k = kv_self.size_k_bytes(); - const size_t memory_size_v = kv_self.size_v_bytes(); + const size_t memory_size_k = kv_self->size_k_bytes(); + const size_t memory_size_v = kv_self->size_v_bytes(); LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), @@ -2427,19 +2428,19 @@ llama_context_kv_self::~llama_context_kv_self() = default; void llama_context_kv_self::reserve() { // simulate full KV cache - kv_self.n = kv_self.size; + kv_self->n = kv_self->size; - LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); + LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self->n); llama_context_base::reserve(); } llama_kv_cache * llama_context_kv_self::get_kv_self() { - 
return &kv_self; + return kv_self.get(); } const llama_kv_cache * llama_context_kv_self::get_kv_self() const { - return &kv_self; + return kv_self.get(); } void llama_context_kv_self::kv_self_update() { @@ -2449,8 +2450,8 @@ void llama_context_kv_self::kv_self_update() { bool need_reserve = false; - if (kv.has_shift) { - if (!kv.can_shift) { + if (kv->has_shift) { + if (!kv->get_can_shift()) { GGML_ABORT("The current context does not support K-shift"); } @@ -2474,16 +2475,16 @@ void llama_context_kv_self::kv_self_update() { } { - kv.has_shift = false; + kv->has_shift = false; - for (uint32_t i = 0; i < kv.size; ++i) { - kv.cells[i].delta = 0; + for (uint32_t i = 0; i < kv->size; ++i) { + kv->cells[i].delta = 0; } } } // defragment the KV cache if needed - if (kv.do_defrag) { + if (kv->do_defrag) { LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); ggml_backend_sched_reset(sched.get()); @@ -2499,7 +2500,7 @@ void llama_context_kv_self::kv_self_update() { graph_compute(gf, false); - kv.do_defrag = false; + kv->do_defrag = false; need_reserve = true; } @@ -2513,7 +2514,7 @@ void llama_context_kv_self::kv_self_update() { uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); // simulate full KV cache - kv_self.n = kv_self.size; + kv_self->n = kv_self->size; llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; @@ -2537,7 +2538,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const int32_t n_tokens = batch.n_tokens; @@ -2674,7 +2675,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self->pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -2689,7 +2690,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // TODO: remove this stuff class batch_guard { public: - batch_guard(llama_kv_cache & kv_self) : kv_slot_restorer(kv_self) { + batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) { } ~batch_guard() { @@ -2712,7 +2713,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_kv_slot_restorer kv_slot_restorer; }; - batch_guard bg(kv_self); + batch_guard bg(*kv_self); GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT @@ -2797,11 +2798,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { - kv_self.head = 0; + if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) { + kv_self->head = 0; } - const auto slot_info = kv_self.find_slot(ubatch); + const auto slot_info = kv_self->find_slot(ubatch); if (!slot_info) { LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); return -3; @@ -2813,12 +2814,12 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); + const uint32_t pad = kv_self->get_padding(cparams); + kv_self->n = std::min(kv_self->size, std::max(pad, GGML_PAD(kv_self->cell_max(), pad))); } } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head); ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); @@ -2847,11 +2848,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // update the kv ring buffer { - kv_self.head += ubatch.n_tokens; + kv_self->head += ubatch.n_tokens; // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; + if (kv_self->head >= kv_self->size) { + kv_self->head = 0; } } @@ -2972,13 +2973,13 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { // - do not defrag small contexts (i.e. < 2048 tokens) // - count the padding towards the number of used tokens - const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + kv_self.get_padding(cparams))/float(kv_self.n)) : 0.0f; + const float fragmentation = kv_self->n >= 2048 ? 
std::max(0.0f, 1.0f - float(kv_self->used + kv_self->get_padding(cparams))/float(kv_self->n)) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - kv_self.defrag(); + kv_self->defrag(); } } @@ -2997,8 +2998,8 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { int32_t * data = (int32_t *) inp.self_k_shift->data; - for (uint32_t i = 0; i < kv_self.size; ++i) { - data[i] = kv_self.cells[i].delta; + for (uint32_t i = 0; i < kv_self->size; ++i) { + data[i] = kv_self->cells[i].delta; } // the K-shift graph requires just this input @@ -3011,7 +3012,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { if (inp.self_kq_mask || inp.self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. if (cparams.causal_attn) { - const int64_t n_kv = kv_self.n; + const int64_t n_kv = kv_self->n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; @@ -3041,11 +3042,11 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { for (int i = 0; i < n_kv; ++i) { float f; - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + if (!kv_self->cells[i].has_seq_id(seq_id) || kv_self->cells[i].pos > pos) { f = -INFINITY; } else { if (hparams.use_alibi) { - f = -std::abs(kv_self.cells[i].pos - pos); + f = -std::abs(kv_self->cells[i].pos - pos); } else { f = 0.0f; } @@ -3057,7 +3058,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // may need to cut off old tokens for sliding window if (data_swa) { - if (pos - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) { + if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) { f = -INFINITY; } data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; @@ -3137,11 +3138,11 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { int32_t * data = (int32_t *) inp.self_pos_bucket->data; - const int64_t n_kv = kv_self.n; + const int64_t n_kv = kv_self->n; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); } } } @@ -3164,7 +3165,7 @@ ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) { - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; inp.self_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); ggml_set_input(inp.self_pos_bucket); @@ -3177,7 +3178,7 @@ void llama_context_kv_self::build_attn_inp( int32_t n_tokens, bool causal, bool swa) { - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; inp.self_kq_mask = causal ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) @@ -3224,13 +3225,13 @@ ggml_tensor * llama_context_kv_self::build_attn( // store to KV cache { - GGML_ASSERT(!kv_self.recurrent); + GGML_ASSERT(!kv_self->recurrent); - const auto kv_head = kv_self.head; + const auto kv_head = kv_self->head; - GGML_ASSERT(kv_self.size == n_ctx); + GGML_ASSERT(kv_self->size == n_ctx); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head); //cb(k_cache_view, "k_cache_view", il); // note: storing RoPE-ed version of K in the KV cache @@ -3241,12 +3242,12 @@ ggml_tensor * llama_context_kv_self::build_attn( struct ggml_tensor * v_cache_view = nullptr; if (!v_trans) { - v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head); } else { // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv_self.v_l[il]), - (kv_head)*ggml_element_size(kv_self.v_l[il])); + v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self->v_l[il]), + (kv_head)*ggml_element_size(kv_self->v_l[il])); v_cur = ggml_transpose(ctx0, v_cur); } @@ -3281,7 +3282,7 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto & kq_mask = is_sliding ? inp.self_kq_mask_swa_cnv : inp.self_kq_mask_cnv; - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; const int64_t n_head_kv = hparams.n_head_kv(il); @@ -3292,23 +3293,23 @@ ggml_tensor * llama_context_kv_self::build_attn( //cb(q, "q", il); ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], + ggml_view_3d(ctx0, kv_self->k_l[il], n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k), 0); //cb(k, "k", il); ggml_tensor * v = !v_trans ? 
- ggml_view_3d(ctx0, kv_self.v_l[il], + ggml_view_3d(ctx0, kv_self->v_l[il], n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v), 0) : - ggml_view_3d(ctx0, kv_self.v_l[il], + ggml_view_3d(ctx0, kv_self->v_l[il], n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + ggml_element_size(kv_self->v_l[il])*n_ctx, + ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v, 0); struct ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale); @@ -3326,7 +3327,7 @@ void llama_context_kv_self::build_kv_self_shift( const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; - //GGML_ASSERT(kv_self.size == n_ctx); + //GGML_ASSERT(kv_self->size == n_ctx); ggml_tensor * inp_self_k_shift = build_inp_self_k_shift(ctx0); @@ -3337,13 +3338,13 @@ void llama_context_kv_self::build_kv_self_shift( struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, kv_self.size, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_view_3d(ctx0, kv_self->k_l[il], + n_embd_head_k, n_head_kv, kv_self->size, + ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self.k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self->k_l[il]->buffer); ggml_build_forward_expand(gf, cur); } @@ -3356,8 +3357,8 @@ void llama_context_kv_self::build_kv_self_defrag( const uint32_t n_layer = hparams.n_layer; - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; + const uint32_t n_kv = kv_self->cell_max(); + const uint32_t n_used = kv_self->used; assert(n_used <= n_kv); @@ -3382,7 +3383,7 @@ void llama_context_kv_self::build_kv_self_defrag( std::vector ids(n_kv, n_kv); for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; + const auto & cell0 = kv_self->cells[i0]; if (!cell0.is_empty()) { ids[i0] = i0; @@ -3395,7 +3396,7 @@ void llama_context_kv_self::build_kv_self_defrag( uint32_t nh = 1; // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + while (i0 + nh < n_used && kv_self->cells[i0 + nh].is_empty()) { nh++; } @@ -3404,7 +3405,7 @@ void llama_context_kv_self::build_kv_self_defrag( // starting from the end, find nh non-empty cells for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; + const auto & cell1 = kv_self->cells[is]; if (cell1.is_empty() || ids[is] != n_kv) { continue; @@ -3433,7 +3434,7 @@ void llama_context_kv_self::build_kv_self_defrag( // go back and move the nf cells to the hole for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; + auto & cell1 = kv_self->cells[i1]; if (cell1.is_empty() || ids[i1] != n_kv) { if (n_moves == max_moves) { @@ -3449,11 +3450,11 @@ void llama_context_kv_self::build_kv_self_defrag( ids[i1] = i0 + nf; // move the cell meta data - kv_self.cells[i0 + nf] = cell1; + kv_self->cells[i0 + nf] = cell1; // clear the old cell and move the head there cell1 = llama_kv_cell(); - kv_self.head = 
n_used; + kv_self->head = n_used; if (!cont) { n_moves++; @@ -3572,40 +3573,40 @@ void llama_context_kv_self::build_kv_self_defrag( const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il], n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i)); - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il], n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id)); ggml_tensor * view_v_src; ggml_tensor * view_v_dst; if (cparams.flash_attn) { // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i)); - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id)); } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); + ggml_row_size(kv_self->v_l[il]->type, kv_self->size), + ggml_row_size(kv_self->v_l[il]->type, i)); - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); + ggml_row_size(kv_self->v_l[il]->type, kv_self->size), + ggml_row_size(kv_self->v_l[il]->type, id)); } ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); @@ -3625,7 +3626,7 @@ size_t llama_context_kv_self::state_write_data(llama_io_write_i & io) { llama_context_base::state_write_data(io); LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); - kv_self.state_write(io); + kv_self->state_write(io); return io.n_bytes(); } @@ -3634,7 +3635,7 @@ size_t llama_context_kv_self::state_read_data(llama_io_read_i & io) { llama_context_base::state_read_data(io); LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); - kv_self.state_read(io); + kv_self->state_read(io); return io.n_bytes(); } @@ -3642,7 +3643,7 @@ size_t llama_context_kv_self::state_read_data(llama_io_read_i & io) { size_t llama_context_kv_self::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context_base::state_seq_write_data(io, seq_id); - kv_self.state_write(io, seq_id); + kv_self->state_write(io, seq_id); return io.n_bytes(); } @@ -3650,7 +3651,7 @@ size_t llama_context_kv_self::state_seq_write_data(llama_io_write_i & io, llama_ size_t 
llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context_base::state_seq_read_data(io, seq_id); - kv_self.state_read(io, seq_id); + kv_self->state_read(io, seq_id); return io.n_bytes(); } @@ -3663,12 +3664,13 @@ llama_context_recurrent::llama_context_recurrent( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context_base(model, params, gtype), - kv_self(model.hparams) { + llama_context_base(model, params, gtype) { LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); const auto & hparams = model.hparams; + kv_self = std::make_unique(hparams); + LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); // Mamba only needs a constant number of KV cache cells per sequence @@ -3684,14 +3686,14 @@ llama_context_recurrent::llama_context_recurrent( GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); if (!hparams.vocab_only) { - if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); throw std::runtime_error("failed to initialize self-attention cache"); } { - const size_t memory_size_k = kv_self.size_k_bytes(); - const size_t memory_size_v = kv_self.size_v_bytes(); + const size_t memory_size_k = kv_self->size_k_bytes(); + const size_t memory_size_v = kv_self->size_v_bytes(); LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), @@ -3705,20 +3707,20 @@ llama_context_recurrent::~llama_context_recurrent() = default; void llama_context_recurrent::reserve() { // simulate full KV cache - kv_self.n = kv_self.size; + kv_self->n = kv_self->size; - LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); + LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self->n); // TODO: implement recurrent-specific reserve logic llama_context_base::reserve(); } llama_kv_cache * llama_context_recurrent::get_kv_self() { - return &kv_self; + return kv_self.get(); } const llama_kv_cache * llama_context_recurrent::get_kv_self() const { - return &kv_self; + return kv_self.get(); } void llama_context_recurrent::kv_self_update() { @@ -3740,7 +3742,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self->pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -3755,7 +3757,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // TODO: remove this stuff class batch_guard { public: - batch_guard(llama_kv_cache & kv_self) : kv_slot_restorer(kv_self) { + batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) { } ~batch_guard() { @@ -3778,7 +3780,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { llama_kv_slot_restorer kv_slot_restorer; }; - batch_guard bg(kv_self); + batch_guard bg(*kv_self); GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT @@ -3870,11 +3872,11 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { - kv_self.head = 0; + if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) { + kv_self->head = 0; } - const auto slot_info = kv_self.find_slot(ubatch); + const auto slot_info = kv_self->find_slot(ubatch); if (!slot_info) { LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); return -3; @@ -3883,7 +3885,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { bg.save(slot_info); } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head); ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); @@ -3912,11 +3914,11 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // update the kv ring buffer { - kv_self.head += ubatch.n_tokens; + kv_self->head += ubatch.n_tokens; // Ensure kv cache head points to a valid index. 
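/*
 * Illustrative sketch (not from the original patch): the surrounding hunk is the ring-buffer
 * head update that runs after each decoded ubatch. Assuming only the `head` and `size`
 * members that llama_kv_cache_unified exposes, the invariant being maintained is:
 *
 *     // hypothetical stand-in type, for illustration only
 *     struct kv_ring { uint32_t head = 0; uint32_t size = 0; };
 *
 *     void kv_ring_advance(kv_ring & kv, uint32_t n_tokens) {
 *         kv.head += n_tokens;       // move past the cells written by this ubatch
 *         if (kv.head >= kv.size) {
 *             kv.head = 0;           // wrap, so the next find_slot() search restarts at cell 0
 *         }
 *     }
 */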
- if (kv_self.head >= kv_self.size) { - kv_self.head = 0; + if (kv_self->head >= kv_self->size) { + kv_self->head = 0; } } @@ -4044,9 +4046,9 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context_base::input_set(ubatch); - GGML_ASSERT(kv_self.recurrent); + GGML_ASSERT(kv_self->recurrent); - const int64_t n_kv = kv_self.n; + const int64_t n_kv = kv_self->n; if (inp.s_mask) { GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_mask->buffer)); @@ -4054,8 +4056,8 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // clear unused states for (int i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = kv_self.cells[cell_id]; + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); @@ -4073,11 +4075,11 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = kv_self.cells[cell_id]; + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; // prevent out-of-bound sources - if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { + if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { kv_cell.src = cell_id; } @@ -4101,7 +4103,7 @@ ggml_cgraph * llama_context_recurrent::graph_init() { ggml_tensor * llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0) { - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; inp.s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); //cb(inp.s_copy, "inp_s_copy", -1); @@ -4112,7 +4114,7 @@ ggml_tensor * llama_context_recurrent::build_inp_s_copy( ggml_tensor * llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0) { - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; inp.s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); //cb(inp.s_mask, "inp_s_mask", -1); @@ -4129,10 +4131,10 @@ ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_tensor * state_mask, int32_t n_state, int32_t n_seqs) { - const auto n_kv = kv_self.n; - const auto kv_head = kv_self.head; + const auto n_kv = kv_self->n; + const auto kv_head = kv_self->head; - struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); + struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self->size); // copy states // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv @@ -4164,7 +4166,7 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( int il) { const auto & hparams = model.hparams; - const auto kv_head = kv_self.head; + const auto kv_head = kv_self->head; const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = hparams.ssm_d_inner; @@ -4182,8 +4184,8 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - struct ggml_tensor * conv_states_all = kv_self.k_l[il]; - struct ggml_tensor * ssm_states_all = kv_self.v_l[il]; + struct ggml_tensor * conv_states_all = kv_self->k_l[il]; + struct ggml_tensor * ssm_states_all = kv_self->v_l[il]; // (ab)using the KV cache to store the states struct ggml_tensor * conv = build_copy_mask_state( @@ -4300,7 +4302,7 @@ ggml_tensor * 
llama_context_recurrent::build_rwkv_token_shift_load( const int64_t n_seqs = ubatch.n_seqs; - struct ggml_tensor * token_shift_all = kv_self.k_l[il]; + struct ggml_tensor * token_shift_all = kv_self->k_l[il]; struct ggml_tensor * token_shift = build_copy_mask_state( ctx0, gf, token_shift_all, state_copy, state_mask, @@ -4323,12 +4325,12 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store( const int64_t n_seqs = ubatch.n_seqs; - const auto kv_head = kv_self.head; + const auto kv_head = kv_self->head; return ggml_cpy( ctx0, ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + ggml_view_1d(ctx0, kv_self->k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self->k_l[il])) ); } @@ -4350,7 +4352,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( const auto n_head = n_embd / head_size; const auto n_head_kv = hparams.n_head_kv(il); - const auto kv_head = kv_self.head; + const auto kv_head = kv_self->head; const auto & layer = model.layers[il]; @@ -4458,7 +4460,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( } struct ggml_tensor * wkv_state = build_copy_mask_state( - ctx0, gf, kv_self.v_l[il], state_copy, state_mask, + ctx0, gf, kv_self->v_l[il], state_copy, state_mask, hparams.n_embd_v_s(), n_seqs); struct ggml_tensor * wkv_output; @@ -4477,9 +4479,9 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( wkv_state, ggml_view_1d( ctx0, - kv_self.v_l[il], + kv_self->v_l[il], hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il]) ) ) ); @@ -4507,7 +4509,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( size_t llama_context_recurrent::state_write_data(llama_io_write_i & io) { llama_context_base::state_write_data(io); - kv_self.state_write(io); + kv_self->state_write(io); return io.n_bytes(); } @@ -4515,7 +4517,7 @@ size_t llama_context_recurrent::state_write_data(llama_io_write_i & io) { size_t llama_context_recurrent::state_read_data(llama_io_read_i & io) { llama_context_base::state_read_data(io); - kv_self.state_read(io); + kv_self->state_read(io); return io.n_bytes(); } @@ -4523,7 +4525,7 @@ size_t llama_context_recurrent::state_read_data(llama_io_read_i & io) { size_t llama_context_recurrent::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context_base::state_seq_write_data(io, seq_id); - kv_self.state_write(io, seq_id); + kv_self->state_write(io, seq_id); return io.n_bytes(); } @@ -4531,7 +4533,7 @@ size_t llama_context_recurrent::state_seq_write_data(llama_io_write_i & io, llam size_t llama_context_recurrent::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context_base::state_seq_read_data(io, seq_id); - kv_self.state_read(io, seq_id); + kv_self->state_read(io, seq_id); return io.n_bytes(); } @@ -5211,7 +5213,7 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * return; } - llama_kv_cache_view_update(view, *kv); + llama_kv_cache_view_update(view, kv); } // diff --git a/src/llama-context.h b/src/llama-context.h index 1b807ccf84a5c..d74db70c7781c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -630,7 +630,7 @@ class llama_context_kv_self : public llama_context_base { // members // - llama_kv_cache kv_self; + 
std::unique_ptr kv_self; }; // a recurrent transformer (ie.e RWKV, Mamba) @@ -745,7 +745,7 @@ class llama_context_recurrent : public llama_context_base { // // TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models? - llama_kv_cache_recurrent kv_self; + std::unique_ptr kv_self; }; // TODO: tmp - need something better to pass the data from the encoder to the decoder diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index e1b07c9932166..0cd4142d5f8d5 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -6,17 +6,16 @@ #include "llama-model.h" #include -#include #include #include #include static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; -llama_kv_cache::llama_kv_cache(const llama_hparams & hparams) : hparams(hparams) { +llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams) : hparams(hparams) { } -bool llama_kv_cache::init( +bool llama_kv_cache_unified::init( const llama_model & model, const llama_cparams & cparams, ggml_type type_k, @@ -123,7 +122,7 @@ bool llama_kv_cache::init( return true; } -int32_t llama_kv_cache::n_tokens() const { +int32_t llama_kv_cache_unified::n_tokens() const { int32_t result = 0; for (uint32_t i = 0; i < size; i++) { @@ -133,7 +132,11 @@ int32_t llama_kv_cache::n_tokens() const { return result; } -size_t llama_kv_cache::total_size() const { +uint32_t llama_kv_cache_unified::used_cells() const { + return used; +} + +size_t llama_kv_cache_unified::total_size() const { size_t size = 0; for (const auto & buf : bufs) { size += ggml_backend_buffer_get_size(buf.get()); @@ -142,7 +145,7 @@ size_t llama_kv_cache::total_size() const { return size; } -llama_pos llama_kv_cache::pos_max() const { +llama_pos llama_kv_cache_unified::pos_max() const { llama_pos pos_max = -1; for (const auto & cell : cells) { pos_max = std::max(pos_max, cell.pos); @@ -151,7 +154,7 @@ llama_pos llama_kv_cache::pos_max() const { return pos_max; } -void llama_kv_cache::clear() { +void llama_kv_cache_unified::clear() { for (int32_t i = 0; i < (int32_t) size; ++i) { cells[i].pos = -1; cells[i].seq_id.clear(); @@ -166,7 +169,7 @@ void llama_kv_cache::clear() { } } -bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { +bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { uint32_t new_head = size; if (p0 < 0) { @@ -237,7 +240,7 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { return true; } -void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { +void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { if (seq_id_src == seq_id_dst) { return; } @@ -288,7 +291,7 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll } } -void llama_kv_cache::seq_keep(llama_seq_id seq_id) { +void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { uint32_t new_head = size; for (uint32_t i = 0; i < size; ++i) { @@ -320,7 +323,7 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) { } } -void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { +void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { if (delta == 0) { return; } @@ -378,7 +381,7 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll head = new_head != size ? 
new_head : 0; } -void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { +void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { if (d == 1) { return; } @@ -424,7 +427,7 @@ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, in } } -llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) { +llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) { llama_pos result = 0; for (uint32_t i = 0; i < size; ++i) { @@ -436,13 +439,17 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) { return result; } -void llama_kv_cache::defrag() { +void llama_kv_cache_unified::defrag() { if (!recurrent) { do_defrag = true; } } -struct llama_kv_cache_slot_info llama_kv_cache::find_slot( +bool llama_kv_cache_unified::get_can_shift() const { + return can_shift; +} + +struct llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( const struct llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = ubatch.n_seqs; @@ -663,12 +670,12 @@ struct llama_kv_cache_slot_info llama_kv_cache::find_slot( return llama_kv_cache_slot_info(head, head + n_tokens); } -uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) const { +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { // the FA kernels require padding to avoid extra runtime boundary checks return cparams.flash_attn ? 256u : 32u; } -uint32_t llama_kv_cache::cell_max() const { +uint32_t llama_kv_cache_unified::cell_max() const { for (uint32_t i = size; i > 0; --i) { const llama_kv_cell & cell = cells[i - 1]; @@ -680,7 +687,7 @@ uint32_t llama_kv_cache::cell_max() const { return 0; } -size_t llama_kv_cache::size_k_bytes() const { +size_t llama_kv_cache_unified::size_k_bytes() const { size_t size_k_bytes = 0; for (const auto & k : k_l) { @@ -690,7 +697,7 @@ size_t llama_kv_cache::size_k_bytes() const { return size_k_bytes; } -size_t llama_kv_cache::size_v_bytes() const { +size_t llama_kv_cache_unified::size_v_bytes() const { size_t size_v_bytes = 0; for (const auto & v : v_l) { @@ -700,7 +707,7 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { +void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -738,7 +745,7 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) con state_write_data(io, cell_ranges); } -void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id) { +void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) { uint32_t cell_count; io.read_to(&cell_count, sizeof(cell_count)); @@ -756,7 +763,7 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id) { } } -void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { +void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { for (const auto & range : cell_ranges) { for (uint32_t i = range.first; i < range.second; ++i) { const auto & cell = cells[i]; @@ -775,7 +782,7 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges) const { +void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const 
std::vector> & cell_ranges) const { const uint32_t v_trans = this->v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; @@ -855,7 +862,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector= llama_n_seq_max(ctx)) { if (seq_id < 0) { //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); @@ -957,7 +964,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t cell_count, return true; } -bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t cell_count) { +bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; io.read_to(&v_trans, sizeof(v_trans)); @@ -1092,7 +1099,7 @@ int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { return 0; } - return kv->used; + return kv->used_cells(); } void llama_kv_cache_clear(llama_kv_cache * kv) { @@ -1183,7 +1190,7 @@ bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { return false; } - return kv->can_shift; + return kv->get_can_shift(); } // @@ -1216,9 +1223,16 @@ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { } } -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv) { - if (uint32_t(view->n_cells) < kv.size || view->cells == nullptr) { - view->n_cells = int32_t(kv.size); +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache * kv) { + // TODO: rework this in the future, for now quick hack + const llama_kv_cache_unified * kvu = dynamic_cast(kv); + if (kvu == nullptr) { + LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__); + return; + } + + if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) { + view->n_cells = int32_t(kvu->size); void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); view->cells = (struct llama_kv_cache_view_cell *)p; @@ -1227,7 +1241,7 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct view->cells_sequences = (llama_seq_id *)p; } - const std::vector & kv_cells = kv.cells; + const std::vector & kv_cells = kvu->cells; llama_kv_cache_view_cell * c_curr = view->cells; llama_seq_id * cs_curr = view->cells_sequences; int32_t used_cells = 0; @@ -1236,7 +1250,7 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct uint32_t max_contig = 0; int32_t max_contig_idx = -1; - for (int32_t i = 0; i < int32_t(kv.size); i++, c_curr++, cs_curr += view->n_seq_max) { + for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) { const size_t curr_size = kv_cells[i].seq_id.size(); token_count += curr_size; c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; @@ -1274,8 +1288,8 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct view->max_contiguous_idx = max_contig_idx; view->token_count = token_count; view->used_cells = used_cells; - if (uint32_t(used_cells) != kv.used) { + if (uint32_t(used_cells) != kvu->used) { LLAMA_LOG_ERROR("%s: used cells mismatch. 
kv_cache says %d but we calculated %d\n", - __func__, kv.used, used_cells); + __func__, kvu->used, used_cells); } } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dda9bfec48846..99eb0be3c7404 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -45,12 +45,39 @@ struct llama_kv_cache_slot_info { operator bool() const { return found; } }; +struct llama_kv_cache { +public: + virtual int32_t n_tokens() const = 0; + virtual uint32_t used_cells() const = 0; // TODO: remove + + virtual void clear() = 0; + virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; + virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; + virtual void seq_keep(llama_seq_id seq_id) = 0; + virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0; + virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; + + virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0; + + virtual void defrag() = 0; + virtual bool get_can_shift() const = 0; +}; + + +// C++ alias +class llama_kv_cache_i : public llama_kv_cache { +public: + using llama_kv_cache::llama_kv_cache; +}; + + // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences -struct llama_kv_cache { - llama_kv_cache(const llama_hparams & hparams); - virtual ~llama_kv_cache() = default; +class llama_kv_cache_unified : public llama_kv_cache_i { +public: + llama_kv_cache_unified(const llama_hparams & hparams); + virtual ~llama_kv_cache_unified() = default; // TODO: become constructor bool init( @@ -61,24 +88,26 @@ struct llama_kv_cache { uint32_t kv_size, bool offload); - int32_t n_tokens() const; + int32_t n_tokens() const override; + uint32_t used_cells() const override; size_t total_size() const; // TODO: better data structures to reduce the cost of this operation llama_pos pos_max() const; - void clear(); + void clear() override; - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); - void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); - void seq_keep(llama_seq_id seq_id); - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; - llama_pos seq_pos_max(llama_seq_id seq_id); + llama_pos seq_pos_max(llama_seq_id seq_id) override; - void defrag(); + void defrag() override; + bool get_can_shift() const override; // find an empty slot of size "n_tokens" in the cache // updates the cache head @@ -143,9 +172,10 @@ struct llama_kv_cache { bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; -// TODO: temporary reusing llama_kv_cache -- implement recurrent cache and simplify llama_kv_cache -struct llama_kv_cache_recurrent : public llama_kv_cache { - using llama_kv_cache::llama_kv_cache; +// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified +class llama_kv_cache_recurrent : public llama_kv_cache_unified { +public: + using llama_kv_cache_unified::llama_kv_cache_unified; }; // @@ -166,9 +196,9 
@@ struct llama_kv_slot_restorer { bool do_restore = false; - llama_kv_cache & cache; + llama_kv_cache_unified & cache; - explicit llama_kv_slot_restorer(llama_kv_cache & cache) : cache(cache) { + explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) { old_state.head = cache.head; old_state.n = cache.n; } @@ -249,4 +279,4 @@ bool llama_kv_cache_can_shift(const llama_kv_cache * kv); struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache * kv); From 38db8a586105ea8d516e66d0dbcb87924efe70b0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 10:51:17 +0200 Subject: [PATCH 80/84] llama : introduce concept of llama_memory ggml-ci --- src/llama-context.cpp | 2 +- src/llama-context.h | 17 +- src/llama-kv-cache.cpp | 8 +- src/llama-kv-cache.h | 47 +- src/llama-memory.cpp | 1295 ++++++++++++++++++++++++++++++++++++++++ src/llama-memory.h | 21 + 6 files changed, 1345 insertions(+), 45 deletions(-) create mode 100644 src/llama-memory.cpp create mode 100644 src/llama-memory.h diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5c77b29c13a7d..c599801763181 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -49,7 +49,7 @@ llama_context_base::llama_context_base( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context_i(), + llama_context(), llama_graph_i(gtype), model(model) { LLAMA_LOG_INFO("%s: constructing llama_context_base, gtype = %d\n", __func__, gtype); diff --git a/src/llama-context.h b/src/llama-context.h index d74db70c7781c..f44652e2d1f18 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -21,10 +21,10 @@ class llama_io_write_i; using llama_loras = std::unordered_map; // abstract interface corresponding to the public C API -struct llama_context { +class llama_context_i { public: - llama_context() = default; - virtual ~llama_context() = default; + llama_context_i() = default; + virtual ~llama_context_i() = default; virtual void init() = 0; @@ -157,14 +157,13 @@ struct llama_context { size_t n_token_count) = 0; }; -// C++ alias -class llama_context_i : public llama_context { -public: - using llama_context::llama_context; +// C alias +struct llama_context : public llama_context_i { + using llama_context_i::llama_context_i; }; // basic transformer without KV cache -class llama_context_base : public llama_context_i, public llama_graph_i { +class llama_context_base : public llama_context, public llama_graph_i { public: llama_context_base( const llama_model & model, @@ -821,7 +820,7 @@ class llama_context_dec : public llama_context_kv_self { llama_cross * cross = nullptr; }; -class llama_context_enc_dec : public llama_context_i { +class llama_context_enc_dec : public llama_context { public: llama_context_enc_dec( const llama_model & model, diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 0cd4142d5f8d5..33ee833125b58 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -122,7 +122,7 @@ bool llama_kv_cache_unified::init( return true; } -int32_t llama_kv_cache_unified::n_tokens() const { +int32_t llama_kv_cache_unified::get_n_tokens() const { int32_t result = 0; for (uint32_t i = 0; i < size; i++) { @@ -132,7 +132,7 @@ int32_t llama_kv_cache_unified::n_tokens() const { return result; } -uint32_t 
llama_kv_cache_unified::used_cells() const { +uint32_t llama_kv_cache_unified::get_used_cells() const { return used; } @@ -1091,7 +1091,7 @@ int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { return 0; } - return kv->n_tokens(); + return kv->get_n_tokens(); } int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { @@ -1099,7 +1099,7 @@ int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { return 0; } - return kv->used_cells(); + return kv->get_used_cells(); } void llama_kv_cache_clear(llama_kv_cache * kv) { diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 99eb0be3c7404..8aed239154885 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -2,7 +2,7 @@ #include "llama.h" #include "llama-io.h" -#include "llama-graph.h" +#include "llama-memory.h" #include "ggml-cpp.h" @@ -13,6 +13,17 @@ struct llama_cparams; struct llama_hparams; struct llama_ubatch; +struct llama_kv_cache : public llama_memory_i { + using llama_memory_i::llama_memory_i; + + virtual int32_t get_n_tokens() const = 0; + virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache + + virtual bool get_can_shift() const = 0; + + bool get_can_edit() const override { return get_can_shift(); } +}; + struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; @@ -45,36 +56,10 @@ struct llama_kv_cache_slot_info { operator bool() const { return found; } }; -struct llama_kv_cache { -public: - virtual int32_t n_tokens() const = 0; - virtual uint32_t used_cells() const = 0; // TODO: remove - - virtual void clear() = 0; - virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; - virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; - virtual void seq_keep(llama_seq_id seq_id) = 0; - virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0; - virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; - - virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0; - - virtual void defrag() = 0; - virtual bool get_can_shift() const = 0; -}; - - -// C++ alias -class llama_kv_cache_i : public llama_kv_cache { -public: - using llama_kv_cache::llama_kv_cache; -}; - - // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences -class llama_kv_cache_unified : public llama_kv_cache_i { +class llama_kv_cache_unified : public llama_kv_cache { public: llama_kv_cache_unified(const llama_hparams & hparams); virtual ~llama_kv_cache_unified() = default; @@ -88,8 +73,8 @@ class llama_kv_cache_unified : public llama_kv_cache_i { uint32_t kv_size, bool offload); - int32_t n_tokens() const override; - uint32_t used_cells() const override; + int32_t get_n_tokens() const override; + uint32_t get_used_cells() const override; size_t total_size() const; @@ -97,6 +82,7 @@ class llama_kv_cache_unified : public llama_kv_cache_i { llama_pos pos_max() const; void clear() override; + void defrag() override; bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; @@ -106,7 +92,6 @@ class llama_kv_cache_unified : public llama_kv_cache_i { llama_pos seq_pos_max(llama_seq_id seq_id) override; - void defrag() override; bool get_can_shift() const override; // find an empty slot of size "n_tokens" in the cache diff --git a/src/llama-memory.cpp b/src/llama-memory.cpp new file mode 100644 index 0000000000000..0cd4142d5f8d5 
--- /dev/null +++ b/src/llama-memory.cpp @@ -0,0 +1,1295 @@ +#include "llama-kv-cache.h" + +#include "llama-impl.h" +#include "llama-batch.h" +#include "llama-cparams.h" +#include "llama-model.h" + +#include +#include +#include +#include + +static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; + +llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams) : hparams(hparams) { +} + +bool llama_kv_cache_unified::init( + const llama_model & model, + const llama_cparams & cparams, + ggml_type type_k, + ggml_type type_v, + uint32_t kv_size, + bool offload) { + const int32_t n_layer = hparams.n_layer; + + has_shift = false; + + recurrent = llama_model_is_recurrent(&model); + v_trans = !recurrent && !cparams.flash_attn; + can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + + LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", + __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); + + head = 0; + size = kv_size; + used = 0; + + this->type_k = type_k; + this->type_v = type_v; + + cells.clear(); + cells.resize(kv_size); + + // create a context for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + struct ggml_init_params params = { + /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return nullptr; + } + + ctx_map[buft] = ctx; + ctxs.emplace_back(ctx); + + return ctx; + } + + return it->second; + }; + + k_l.reserve(n_layer); + v_l.reserve(n_layer); + + for (int i = 0; i < n_layer; i++) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); + + const char * dev_name = "CPU"; + + ggml_backend_buffer_type_t buft; + if (offload) { + auto * dev = model.dev_layer(i); + buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); + } else { + buft = ggml_backend_cpu_buffer_type(); + } + + LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, + i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); + return false; + } + + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + k_l.push_back(k); + v_l.push_back(v); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + auto * buft = it.first; + auto * ctx = it.second; + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + bufs.emplace_back(buf); + } + + return true; +} + +int32_t llama_kv_cache_unified::n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + 
 result += cells[i].seq_id.size();
+ }
+
+ return result;
+}
+
+uint32_t llama_kv_cache_unified::used_cells() const {
+ return used;
+}
+
+size_t llama_kv_cache_unified::total_size() const {
+ size_t size = 0;
+ for (const auto & buf : bufs) {
+ size += ggml_backend_buffer_get_size(buf.get());
+ }
+
+ return size;
+}
+
+llama_pos llama_kv_cache_unified::pos_max() const {
+ llama_pos pos_max = -1;
+ for (const auto & cell : cells) {
+ pos_max = std::max(pos_max, cell.pos);
+ }
+
+ return pos_max;
+}
+
+void llama_kv_cache_unified::clear() {
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
+ cells[i].pos = -1;
+ cells[i].seq_id.clear();
+ cells[i].src = -1;
+ cells[i].tail = -1;
+ }
+ head = 0;
+ used = 0;
+
+ for (auto & buf : bufs) {
+ ggml_backend_buffer_clear(buf.get(), 0);
+ }
+}
+
+bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ uint32_t new_head = size;
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // models like Mamba or RWKV can't have a state partially erased
+ if (recurrent) {
+ if (seq_id >= (int64_t) size) {
+ // could be fatal
+ return false;
+ }
+ if (0 <= seq_id) {
+ int32_t & tail_id = cells[seq_id].tail;
+ if (tail_id >= 0) {
+ const llama_kv_cell & cell = cells[tail_id];
+ // partial intersection is invalid
+ if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+ return false;
+ }
+ // invalidate tails which will be cleared
+ if (p0 <= cell.pos && cell.pos < p1) {
+ tail_id = -1;
+ }
+ }
+ } else {
+ // seq_id is negative, then the range should include everything or nothing
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+ return false;
+ }
+ }
+ }
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
+ if (seq_id < 0) {
+ cells[i].seq_id.clear();
+ } else if (cells[i].has_seq_id(seq_id)) {
+ cells[i].seq_id.erase(seq_id);
+ } else {
+ continue;
+ }
+ if (cells[i].is_empty()) {
+ // keep count of the number of used cells
+ if (cells[i].pos >= 0) {
+ used--;
+ }
+
+ cells[i].pos = -1;
+ cells[i].src = -1;
+
+ if (new_head == size) {
+ new_head = i;
+ }
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != size && new_head < head) {
+ head = new_head;
+ }
+
+ return true;
+}
+
+void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ if (seq_id_src == seq_id_dst) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ if (recurrent) {
+ if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
+ llama_kv_cell & tail_src = cells[seq_id_src];
+ llama_kv_cell & tail_dst = cells[seq_id_dst];
+ if (tail_dst.tail >= 0) {
+ // clear destination seq_id if it wasn't empty
+ llama_kv_cell & cell_dst = cells[tail_dst.tail];
+
+ cell_dst.seq_id.erase(seq_id_dst);
+ tail_dst.tail = -1;
+ if (cell_dst.seq_id.empty()) {
+ cell_dst.pos = -1;
+ cell_dst.delta = -1;
+ cell_dst.src = -1;
+ used -= 1;
+ }
+ }
+ if (tail_src.tail >= 0) {
+ llama_kv_cell & cell_src = cells[tail_src.tail];
+
+ cell_src.seq_id.insert(seq_id_dst);
+ tail_dst.tail = tail_src.tail;
+ }
+ }
+
+ return;
+ }
+
+ // otherwise, this is the KV of a Transformer-like model
+ head = 0;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
+ cells[i].seq_id.insert(seq_id_dst);
+ }
+ }
+}
+
+void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+ uint32_t new_head = size;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (recurrent && (llama_seq_id) i != seq_id) {
+ cells[i].tail = -1;
+ }
+
+ if (!cells[i].has_seq_id(seq_id)) {
+ if (cells[i].pos >= 0) {
+ used--;
+ }
+
+ cells[i].pos = -1;
+ cells[i].src = -1;
+ cells[i].seq_id.clear();
+
+ if (new_head == size){
+ new_head = i;
+ }
+ } else {
+ cells[i].seq_id.clear();
+ cells[i].seq_id.insert(seq_id);
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != size && new_head < head) {
+ head = new_head;
+ }
+}
+
+void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+ if (delta == 0) {
+ return;
+ }
+
+ uint32_t new_head = size;
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) {
+ return;
+ }
+
+ if (recurrent) {
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
+ if (0 <= seq_id && seq_id < (int64_t) size) {
+ const int32_t tail_id = cells[seq_id].tail;
+ if (tail_id >= 0) {
+ llama_kv_cell & cell = cells[tail_id];
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+ cell.pos += delta;
+ }
+ }
+ }
+ return;
+ }
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
+ has_shift = true;
+ cells[i].pos += delta;
+ cells[i].delta += delta;
+
+ if (cells[i].pos < 0) {
+ if (!cells[i].is_empty()) {
+ used--;
+ }
+ cells[i].pos = -1;
+ cells[i].seq_id.clear();
+ if (new_head == size) {
+ new_head = i;
+ }
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ // Otherwise we just start the next search from the beginning.
+ head = new_head != size ? new_head : 0;
+}
+
+void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ if (d == 1) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } +} + +llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +void llama_kv_cache_unified::defrag() { + if (!recurrent) { + do_defrag = true; + } +} + +bool llama_kv_cache_unified::get_can_shift() const { + return can_shift; +} + +struct llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( + const struct llama_ubatch & ubatch) { + const uint32_t n_tokens = ubatch.n_tokens; + const uint32_t n_seqs = ubatch.n_seqs; + const uint32_t n_seq_tokens = ubatch.n_seq_tokens; + + if (recurrent) { + // For recurrent state architectures (like Mamba or RWKV), + // each cache cell can store the state for a whole sequence. + // A slot should be always be contiguous. + + // can only process batches with an equal number of new tokens in each sequence + GGML_ASSERT(ubatch.equal_seqs); + + int32_t min = size - 1; + int32_t max = 0; + + // everything should fit if all seq_ids are smaller than the max + for (uint32_t s = 0; s < n_seqs; ++s) { + const uint32_t n_seq_id = ubatch.n_seq_id[s]; + for (uint32_t j = 0; j < n_seq_id; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + + if (seq_id < 0 || (uint32_t) seq_id >= size) { + // too big seq_id + // TODO: would it be possible to resize the cache instead? 
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); + return llama_kv_cache_slot_info_failed; + } + if (j > 0) { + llama_kv_cell & seq = cells[seq_id]; + if (seq.tail >= 0) { + llama_kv_cell & cell = cells[seq.tail]; + // clear cells from seq_ids that become shared + // (should not normally happen, but let's handle it anyway) + cell.seq_id.erase(seq_id); + seq.tail = -1; + if (cell.seq_id.empty()) { + cell.pos = -1; + cell.src = -1; + used -= 1; + } + } + } + } + } + +#ifndef NDEBUG + { + std::vector tails_verif; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + llama_kv_cell & cell = cells[i]; + for (llama_seq_id seq_id : cell.seq_id) { + if (tails_verif[seq_id] != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); + } + tails_verif[seq_id] = i; + } + } + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); + } + } + } +#endif + + // find next empty cell + uint32_t next_empty_cell = head; + + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + + // find usable cell range + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + llama_kv_cell & seq_meta = cells[seq_id]; + bool has_cell = false; + if (seq_meta.tail >= 0) { + llama_kv_cell & cell = cells[seq_meta.tail]; + GGML_ASSERT(cell.has_seq_id(seq_id)); + // does this seq_id "own" the cell? + if (cell.seq_id.size() == 1) { has_cell = true; } + } + if (!has_cell) { + llama_kv_cell & empty_cell = cells[next_empty_cell]; + GGML_ASSERT(empty_cell.is_empty()); + // copy old tail into the empty cell + if (seq_meta.tail >= 0) { + llama_kv_cell & orig_cell = cells[seq_meta.tail]; + empty_cell.pos = orig_cell.pos; + empty_cell.src = orig_cell.src; + orig_cell.seq_id.erase(seq_id); + empty_cell.seq_id.insert(seq_id); // will be overwritten + } + seq_meta.tail = next_empty_cell; + // find next empty cell + if (s + 1 < n_seqs) { + next_empty_cell += 1; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + } + } + if (min > seq_meta.tail) { min = seq_meta.tail; } + if (max < seq_meta.tail) { max = seq_meta.tail; } + } + + // gather and re-order + for (uint32_t s = 0; s < n_seqs; ++s) { + int32_t dst_id = s + min; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; + if (dst_id != src_id) { + llama_kv_cell & dst_cell = cells[dst_id]; + llama_kv_cell & src_cell = cells[src_id]; + + std::swap(dst_cell.pos, src_cell.pos); + std::swap(dst_cell.src, src_cell.src); + std::swap(dst_cell.seq_id, src_cell.seq_id); + + // swap tails (assuming they NEVER overlap) + for (const llama_seq_id seq_id : src_cell.seq_id) { + cells[seq_id].tail = src_id; + } + for (const llama_seq_id seq_id : dst_cell.seq_id) { + cells[seq_id].tail = dst_id; + } + } + } + + // update the pos of the used seqs + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + int32_t cell_id = s + min; + llama_kv_cell & cell = cells[cell_id]; + + if (cell.pos >= 0 && last_pos != cell.pos + 
(llama_pos) n_seq_tokens) { + // What should happen when the pos backtracks or skips a value? + // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", + __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); + } + cell.pos = last_pos; + cell.seq_id.clear(); + for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + cell.seq_id.insert(seq_id); + cells[seq_id].tail = cell_id; + } + } + + // allow getting the range of used cells, from head to head + n + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), + [](const llama_kv_cell& cell){ return !cell.is_empty(); }); + + // sanity check + return llama_kv_cache_slot_info(n >= n_seqs); + } + + // otherwise, one cell per token. + + if (n_tokens > size) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); + return llama_kv_cache_slot_info_failed; + } + + uint32_t n_tested = 0; + + while (true) { + if (head + n_tokens > size) { + n_tested += size - head; + head = 0; + continue; + } + + bool found = true; + for (uint32_t i = 0; i < n_tokens; i++) { + if (cells[head + i].pos >= 0) { + found = false; + head += i + 1; + n_tested += i + 1; + break; + } + } + + if (found) { + break; + } + + if (n_tested >= size) { + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return llama_kv_cache_slot_info_failed; + } + } + + for (uint32_t s = 0; s < n_seqs; s++) { + for (uint32_t i = 0; i < n_seq_tokens; ++i) { + uint32_t k = s*n_seq_tokens + i; + cells[head + k].pos = ubatch.pos[k]; + + for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { + cells[head + k].seq_id.insert(ubatch.seq_id[s][j]); + } + } + } + + used += n_tokens; + + return llama_kv_cache_slot_info(head, head + n_tokens); +} + +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; +} + +uint32_t llama_kv_cache_unified::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const llama_kv_cell & cell = cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; + } + } + + return 0; +} + +size_t llama_kv_cache_unified::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache_unified::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + +void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges); +} + +void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, cell_count); + + if (!res) { + if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { + const uint32_t v_trans = this->v_trans ? 
1 : 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + io.write_tensor(v_l[il], src_offset, buf_size); + } + } + } + } +} + +bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find 
available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache_unified should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + if (recurrent) { + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + } + + head = 0; + used = cell_count; + } + + if (recurrent) { + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + } + + return true; +} + +bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the 
keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } + } + + return true; +} + +// +// interface implementation +// + +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + if (!kv) { + return 0; + } + + return kv->n_tokens(); +} + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + if (!kv) { + return 0; + } + + return kv->used_cells(); +} + +void llama_kv_cache_clear(llama_kv_cache * kv) { + if (!kv) { + return; + } + + kv->clear(); +} + +bool llama_kv_cache_seq_rm( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + if (!kv) { + return true; + } + + return kv->seq_rm(seq_id, p0, p1); +} + +void llama_kv_cache_seq_cp( + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + if (!kv) { + return; + } + + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); +} + +void 
llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id) { + if (!kv) { + return; + } + + kv->seq_keep(seq_id); +} + +void llama_kv_cache_seq_add( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + if (!kv) { + return; + } + + kv->seq_add(seq_id, p0, p1, delta); +} + +void llama_kv_cache_seq_div( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + if (!kv) { + return; + } + + kv->seq_div(seq_id, p0, p1, d); +} + +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id) { + if (!kv) { + return 0; + } + + return kv->seq_pos_max(seq_id); +} + +void llama_kv_cache_defrag(llama_kv_cache * kv) { + if (!kv) { + return; + } + + kv->defrag(); +} + +bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { + if (!kv) { + return false; + } + + return kv->get_can_shift(); +} + +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max) { + struct llama_kv_cache_view result = { + /*.n_cells = */ 0, + /*.n_seq_max = */ n_seq_max, + /*.token_count = */ 0, + /*.used_cells = */ llama_kv_cache_used_cells(&kv), + /*.max_contiguous = */ 0, + /*.max_contiguous_idx = */ -1, + /*.cells = */ nullptr, + /*.cells_sequences = */ nullptr, + }; + + return result; +} + +void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { + if (view->cells != nullptr) { + free(view->cells); + view->cells = nullptr; + } + if (view->cells_sequences != nullptr) { + free(view->cells_sequences); + view->cells_sequences = nullptr; + } +} + +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache * kv) { + // TODO: rework this in the future, for now quick hack + const llama_kv_cache_unified * kvu = dynamic_cast(kv); + if (kvu == nullptr) { + LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__); + return; + } + + if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) { + view->n_cells = int32_t(kvu->size); + void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); + GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); + view->cells = (struct llama_kv_cache_view_cell *)p; + p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells); + GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); + view->cells_sequences = (llama_seq_id *)p; + } + + const std::vector & kv_cells = kvu->cells; + llama_kv_cache_view_cell * c_curr = view->cells; + llama_seq_id * cs_curr = view->cells_sequences; + int32_t used_cells = 0; + int32_t token_count = 0; + int32_t curr_contig_idx = -1; + uint32_t max_contig = 0; + int32_t max_contig_idx = -1; + + for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) { + const size_t curr_size = kv_cells[i].seq_id.size(); + token_count += curr_size; + c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; + + if (curr_size > 0) { + if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) { + max_contig = i - curr_contig_idx; + max_contig_idx = curr_contig_idx; + } + curr_contig_idx = -1; + } else if (curr_contig_idx < 0) { + curr_contig_idx = i; + } + + int seq_idx = 0; + for (const llama_seq_id it : kv_cells[i].seq_id) { + if (seq_idx >= view->n_seq_max) { + break; + } + cs_curr[seq_idx] = it; + seq_idx++; + } + if (seq_idx != 0) { + used_cells++; + } + for (; seq_idx < 
view->n_seq_max; seq_idx++) { + cs_curr[seq_idx] = -1; + } + } + if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) { + max_contig_idx = curr_contig_idx; + max_contig = kv_cells.size() - curr_contig_idx; + } + view->max_contiguous = max_contig; + view->max_contiguous_idx = max_contig_idx; + view->token_count = token_count; + view->used_cells = used_cells; + if (uint32_t(used_cells) != kvu->used) { + LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n", + __func__, kvu->used, used_cells); + } +} diff --git a/src/llama-memory.h b/src/llama-memory.h new file mode 100644 index 0000000000000..69e6e34ca4516 --- /dev/null +++ b/src/llama-memory.h @@ -0,0 +1,21 @@ +#pragma once + +#include "llama.h" + +// general concept of LLM memory +// the KV cache is a type of LLM memory, but there can be other types +class llama_memory_i { +public: + virtual void clear() = 0; + virtual void defrag() = 0; + + virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; + virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; + virtual void seq_keep(llama_seq_id seq_id) = 0; + virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0; + virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; + + virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0; + + virtual bool get_can_edit() const = 0; +}; From 7f02ee562efae35fa0abcd8f4ae3bbfe3728be27 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 14:09:20 +0200 Subject: [PATCH 81/84] context : decouple inputs, llama_graph_i become const (WIP) ggml-ci --- src/llama-context.cpp | 737 ++++++++++++++++++++++++------------------ src/llama-context.h | 53 ++- src/llama-graph.cpp | 25 +- src/llama-graph.h | 86 ++++- src/llama-model.cpp | 463 +++++++++++++------------- src/llama-model.h | 5 +- 6 files changed, 789 insertions(+), 580 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c599801763181..5ac28f983027e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -45,6 +45,137 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t // llama_context_base // +class llama_graph_input_embd : public llama_graph_input_i { +public: + llama_graph_input_embd() = default; + virtual ~llama_graph_input_embd() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * tokens = nullptr; // I32 [n_batch] + ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] +}; + +void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { + if (ubatch->token) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); + } + + if (ubatch->embd) { + const int64_t n_embd = embd->ne[0]; + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd)); + } +} + +class llama_graph_input_attn_base : public llama_graph_input_attn_i { +public: + llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : + hparams(hparams), + cparams(cparams) { + } + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * get_kq_mask() override { return kq_mask_cnv; } + + ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch] + ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch] + + const llama_hparams & hparams; + const 
llama_cparams & cparams; +}; + +void llama_graph_input_attn_base::set_input(const llama_ubatch * ubatch) { + if (kq_mask) { + if (cparams.causal_attn) { + const int64_t n_kv = ubatch->n_tokens; + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer)); + float * data = (float *) kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch->seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { + if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) { + if (hparams.use_alibi) { + f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; + } + } + } + } + } + } else { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + const int64_t n_stride = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer)); + + float * data = (float *) kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch->seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { + if (ubatch->seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; + } + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } + } + } + } + } + } +} + llama_context_base::llama_context_base( const llama_model & model, llama_context_params params, @@ -714,7 +845,8 @@ int llama_context_base::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove, tmp here, until all inputs are migrated outside the context const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -729,7 +861,7 @@ int llama_context_base::encode(llama_batch & inp_batch) { return -3; } - auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); // extract embeddings if (t_embd) { @@ -870,7 +1002,8 @@ int llama_context_base::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -885,11 +1018,11 @@ int llama_context_base::decode(llama_batch & inp_batch) { } } - auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; - auto * t_embd = cparams.embeddings ? res.t_embd : nullptr; + auto * t_logits = cparams.embeddings ? 
nullptr : res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - if (t_embd && res.t_embd_pooled) { - t_embd = res.t_embd_pooled; + if (t_embd && res->get_embd_pooled()) { + t_embd = res->get_embd_pooled(); } // extract logits @@ -1002,19 +1135,6 @@ int64_t llama_context_base::n_pos_per_token() const { void llama_context_base::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp.tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp.tokens)); - } - - if (ubatch.embd) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp.embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp.embd)); - } - if (ubatch.pos && inp.pos) { const int64_t n_tokens = ubatch.n_tokens; @@ -1159,91 +1279,6 @@ void llama_context_base::input_set(const llama_ubatch & ubatch) { } } - if (inp.kq_mask) { - if (cparams.causal_attn) { - const int64_t n_kv = ubatch.n_tokens; - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.kq_mask->buffer)); - float * data = (float *) inp.kq_mask->data; - - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; - - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; - - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id && ubatch.pos[ti] <= ubatch.pos[tj]) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } - - data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; - } - } - } - } - } - } else { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_stride = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.kq_mask->buffer)); - - float * data = (float *) inp.kq_mask->data; - - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; - - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; - - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } - - data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; - } - } - - for (int i = n_tokens; i < n_stride; ++i) { - data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; - } - } - } - } - } - } - if (inp.pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; @@ -1401,7 +1436,7 @@ ggml_cgraph * llama_context_base::graph_init() { return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); } -llama_graph_result llama_context_base::graph_build( +llama_graph_result_ptr llama_context_base::graph_build( ggml_context * ctx, ggml_cgraph * gf, const llama_ubatch & ubatch) { @@ -1604,21 +1639,24 @@ ggml_tensor * 
llama_context_base::build_rope_shift( } ggml_tensor * llama_context_base::build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) { + llama_graph_result * res, + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) const { const auto & hparams = model.hparams; const int64_t n_embd = hparams.n_embd; + auto inp = std::make_shared(); + struct ggml_tensor * inpL; if (ubatch.token) { - inp.tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp.tokens, "inp_tokens", -1); - ggml_set_input(inp.tokens); + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); - inpL = ggml_get_rows(ctx0, tok_embd, inp.tokens); + inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens); // apply lora for embedding tokens if needed for (const auto & lora : loras) { @@ -1632,15 +1670,15 @@ ggml_tensor * llama_context_base::build_inp_embd( struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, inp.tokens) + ggml_get_rows(ctx0, lw->a, inp->tokens) ), scale); inpL = ggml_add(ctx0, inpL, inpL_delta); } } else { - inp.embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp.embd; - ggml_set_input(inp.embd); + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = inp->embd; + ggml_set_input(inp->embd); } // For Granite architecture @@ -1648,6 +1686,8 @@ ggml_tensor * llama_context_base::build_inp_embd( inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); } + res->add_input(std::move(inp)); + //cb(inpL, "inp_embd", -1); return inpL; @@ -1699,23 +1739,31 @@ ggml_tensor * llama_context_base::build_inp_cls( return inp.cls; } -void llama_context_base::build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa) { +llama_graph_input_attn_ptr llama_context_base::build_attn_inp( + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = std::make_shared(model.hparams, cparams); + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch GGML_UNUSED(causal); GGML_UNUSED(swa); - inp.kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp_kq_mask, "KQ_mask", -1); - ggml_set_input(inp.kq_mask); + ggml_set_input(inp->kq_mask); + + inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; - inp.kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp.kq_mask, GGML_TYPE_F16) : inp.kq_mask; + res->add_input(inp); + + return inp; } ggml_tensor * llama_context_base::build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -1723,10 +1771,10 @@ ggml_tensor * llama_context_base::build_attn( ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) { + int il) const { GGML_UNUSED(il); - const auto & kq_mask = inp.kq_mask_cnv; + const auto & kq_mask = inp->get_kq_mask(); ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); @@ -1751,7 +1799,7 @@ ggml_tensor * llama_context_base::build_attn_mha( ggml_tensor * kq_b, ggml_tensor * kq_mask, bool v_trans, - float kq_scale) { + float kq_scale) const { const auto & hparams = model.hparams; //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); @@ -2380,6 +2428,156 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i // llama_context_kv_self // +class llama_graph_input_attn_kv_self : public llama_graph_input_attn_i { +public: + llama_graph_input_attn_kv_self( + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache_unified * kv_self) : + hparams(hparams), + cparams(cparams), + kv_self(kv_self) { + } + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * get_kq_mask() override { return self_kq_mask_cnv; } + ggml_tensor * get_kq_mask_swa() override { return self_kq_mask_swa_cnv; } + + ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch] + ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch] + ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch] + + const llama_hparams & hparams; + const llama_cparams & cparams; + + const llama_kv_cache_unified * kv_self; +}; + +void llama_graph_input_attn_kv_self::set_input(const llama_ubatch * ubatch) { + if (self_kq_mask || self_kq_mask_swa) { + // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. + if (cparams.causal_attn) { + const int64_t n_kv = kv_self->n; + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + float * data = nullptr; + float * data_swa = nullptr; + + if (self_kq_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); + data = (float *) self_kq_mask->data; + } + + if (self_kq_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer)); + data_swa = (float *) self_kq_mask_swa->data; + } + + // For causal attention, use only the previous KV cells + // of the correct sequence for each token of the ubatch. + // It's assumed that if a token in the batch has multiple sequences, they are equivalent. 
+ for (int h = 0; h < 1; ++h) { + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + j]; + + for (int i = 0; i < n_kv; ++i) { + float f; + if (!kv_self->cells[i].has_seq_id(seq_id) || kv_self->cells[i].pos > pos) { + f = -INFINITY; + } else { + if (hparams.use_alibi) { + f = -std::abs(kv_self->cells[i].pos - pos); + } else { + f = 0.0f; + } + } + + if (data) { + data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + } + + // may need to cut off old tokens for sliding window + if (data_swa) { + if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) { + f = -INFINITY; + } + data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + } + } + } + } + + if (data) { + for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + for (int j = 0; j < n_kv; ++j) { + data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + } + } + } + + if (data_swa) { + for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + for (int j = 0; j < n_kv; ++j) { + data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + } + } + } + } + } else { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + // when using kv cache, the mask needs to match the kv cache size + const int64_t n_stride = n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); + + float * data = (float *) self_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch->seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { + if (ubatch->seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; + } + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } + } + } + } + } + } +} + llama_context_kv_self::llama_context_kv_self( const llama_model & model, llama_context_params params, @@ -2593,7 +2791,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -2608,7 +2807,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { return -3; } - auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + auto * t_embd = res->get_embd_pooled() ? 
res->get_embd_pooled() : res->get_embd(); // extract embeddings if (t_embd) { @@ -2831,7 +3030,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -2861,11 +3061,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} - auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; - auto * t_embd = cparams.embeddings ? res.t_embd : nullptr; + auto * t_logits = cparams.embeddings ? nullptr : res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - if (t_embd && res.t_embd_pooled) { - t_embd = res.t_embd_pooled; + if (t_embd && res->get_embd_pooled()) { + t_embd = res->get_embd_pooled(); } // extract logits @@ -3009,127 +3209,6 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context_base::input_set(ubatch); - if (inp.self_kq_mask || inp.self_kq_mask_swa) { - // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. - if (cparams.causal_attn) { - const int64_t n_kv = kv_self->n; - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - float * data = nullptr; - float * data_swa = nullptr; - - if (inp.self_kq_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); - data = (float *) inp.self_kq_mask->data; - } - - if (inp.self_kq_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask_swa->buffer)); - data_swa = (float *) inp.self_kq_mask_swa->data; - } - - // For causal attention, use only the previous KV cells - // of the correct sequence for each token of the ubatch. - // It's assumed that if a token in the batch has multiple sequences, they are equivalent. 
- for (int h = 0; h < 1; ++h) { - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + j]; - - for (int i = 0; i < n_kv; ++i) { - float f; - if (!kv_self->cells[i].has_seq_id(seq_id) || kv_self->cells[i].pos > pos) { - f = -INFINITY; - } else { - if (hparams.use_alibi) { - f = -std::abs(kv_self->cells[i].pos - pos); - } else { - f = 0.0f; - } - } - - if (data) { - data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; - } - - // may need to cut off old tokens for sliding window - if (data_swa) { - if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) { - f = -INFINITY; - } - data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; - } - } - } - } - - if (data) { - for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_kv; ++j) { - data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; - } - } - } - - if (data_swa) { - for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_kv; ++j) { - data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; - } - } - } - } - } else { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); - - float * data = (float *) inp.self_kq_mask->data; - - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; - - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; - - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } - - data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; - } - } - - for (int i = n_tokens; i < n_stride; ++i) { - data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; - } - } - } - } - } - } - if (inp.self_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; @@ -3173,37 +3252,45 @@ ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( return inp.self_pos_bucket; } -void llama_context_kv_self::build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa) { +llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = std::make_shared(model.hparams, cparams, kv_self.get()); + const auto n_kv = kv_self->n; - inp.self_kq_mask = causal + inp->self_kq_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp.self_kq_mask, "KQ_mask", -1); - ggml_set_input(inp.self_kq_mask); + //cb(inp->self_kq_mask, "KQ_mask", -1); + ggml_set_input(inp->self_kq_mask); - inp.self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.self_kq_mask, GGML_TYPE_F16) : inp.self_kq_mask; + inp->self_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; if (swa) { const auto & hparams = model.hparams; GGML_ASSERT(hparams.n_swa > 0); - inp.self_kq_mask_swa = causal + inp->self_kq_mask_swa = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp.self_kq_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(inp.self_kq_mask_swa); + //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp->self_kq_mask_swa); - inp.self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.self_kq_mask_swa, GGML_TYPE_F16) : inp.self_kq_mask_swa; + inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; } + + res->add_input(inp); + + return inp; } ggml_tensor * llama_context_kv_self::build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -3211,7 +3298,7 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) { + int il) const { const auto & hparams = model.hparams; const auto & n_ctx = cparams.n_ctx; @@ -3280,7 +3367,7 @@ ggml_tensor * llama_context_kv_self::build_attn( } }; - const auto & kq_mask = is_sliding ? inp.self_kq_mask_swa_cnv : inp.self_kq_mask_cnv; + const auto & kq_mask = is_sliding ? inp->get_kq_mask_swa() : inp->get_kq_mask(); const auto n_kv = kv_self->n; @@ -3897,7 +3984,8 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -3927,11 +4015,11 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} - auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; - auto * t_embd = cparams.embeddings ? res.t_embd : nullptr; + auto * t_logits = cparams.embeddings ? nullptr : res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - if (t_embd && res.t_embd_pooled) { - t_embd = res.t_embd_pooled; + if (t_embd && res->get_embd_pooled()) { + t_embd = res->get_embd_pooled(); } // extract logits @@ -4604,7 +4692,8 @@ int llama_context_enc::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -4619,7 +4708,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { return -3; } - auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + auto * t_embd = res->get_embd_pooled() ? 
res->get_embd_pooled() : res->get_embd(); // extract embeddings if (t_embd) { @@ -4693,38 +4782,41 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // llama_context_dec // -void llama_context_dec::reserve() { - // simulate full KV cache - cross->t_embd = nullptr; +class llama_graph_input_attn_dec : public llama_graph_input_attn_i { +public: + llama_graph_input_attn_dec( + llama_graph_input_attn_i * inp_kv_self, + const llama_cross * cross) : inp_kv_self(inp_kv_self), cross(cross) {} - llama_context_kv_self::reserve(); -} + void set_input(const llama_ubatch * ubatch) override; -void llama_context_dec::input_set(const llama_ubatch & ubatch) { - // call base functionality - llama_context_kv_self::input_set(ubatch); + ggml_tensor * get_kq_mask() override { return inp_kv_self->get_kq_mask(); } + ggml_tensor * get_kq_mask_swa() override { return inp_kv_self->get_kq_mask_swa(); } + ggml_tensor * get_kq_mask_cross() override { return cross_kq_mask_cnv; } - if (inp.cross_embd && cross->t_embd) { - assert(inp.cross_embd->type == GGML_TYPE_F32); + ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch] + ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch] - ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd)); - } + llama_graph_input_attn_i * inp_kv_self = nullptr; + const llama_cross * cross = nullptr; +}; - if (inp.cross_kq_mask) { - const int64_t n_enc = inp.cross_kq_mask->ne[0]; - const int64_t n_tokens = ubatch.n_tokens; +void llama_graph_input_attn_dec::set_input(const llama_ubatch * ubatch) { + if (cross_kq_mask) { + const int64_t n_enc = cross_kq_mask->ne[0]; + const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp.cross_kq_mask->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing + GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer)); + GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - float * data = (float *) inp.cross_kq_mask->data; + float * data = (float *) cross_kq_mask->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_enc; ++i) { float f = -INFINITY; - for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[j][s]; + for (int s = 0; s < ubatch->n_seq_id[j]; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[j][s]; if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) { f = 0.0f; } @@ -4742,6 +4834,25 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { } } +void llama_context_dec::reserve() { + // simulate full KV cache + cross->t_embd = nullptr; + + llama_context_kv_self::reserve(); +} + +void llama_context_dec::input_set(const llama_ubatch & ubatch) { + // call base functionality + llama_context_kv_self::input_set(ubatch); + + if (inp.cross_embd && cross->t_embd) { + assert(inp.cross_embd->type == GGML_TYPE_F32); + + ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd)); + } + +} + ggml_cgraph * llama_context_dec::graph_init() { inp = {}; @@ -4769,22 +4880,30 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd( return inp.cross_embd; } -void llama_context_dec::build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa) { - llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); +llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( + llama_graph_result * res, + ggml_context * ctx0, + int32_t 
n_tokens, + bool causal, + bool swa) const { + auto inp_kv_self = llama_context_kv_self::build_attn_inp(res, ctx0, n_tokens, causal, swa); + + auto inp = std::make_shared(inp_kv_self.get(), cross); const int32_t n_enc = cross->t_embd ? cross->t_embd->ne[1] : model.hparams.n_ctx_train; - inp.cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - ggml_set_input(inp.cross_kq_mask); + inp->cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + ggml_set_input(inp->cross_kq_mask); + + inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; + + res->add_input(inp); - inp.cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.cross_kq_mask, GGML_TYPE_F16) : inp.cross_kq_mask; + return inp; } ggml_tensor * llama_context_dec::build_attn_cross( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -4792,10 +4911,10 @@ ggml_tensor * llama_context_dec::build_attn_cross( ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) { + int il) const { GGML_UNUSED(il); - const auto & kq_mask = inp.cross_kq_mask_cnv; + const auto & kq_mask = inp->get_kq_mask_cross(); ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); diff --git a/src/llama-context.h b/src/llama-context.h index f44652e2d1f18..0f248537eded3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -251,22 +251,18 @@ class llama_context_base : public llama_context, public llama_graph_i { // when the compute graph is built, it creates the input tensors that it needs // the contents of the input tensors are set by the input_set() function + // TODO: remove, replace by llama_graph_input_i->set_input() virtual void input_set(const llama_ubatch & ubatch); private: + // TODO: remove, implement as llama_graph_input_xxx struct { // base input tensors - ggml_tensor * tokens; // I32 [n_batch] - ggml_tensor * embd; // F32 [n_embd, n_batch] ggml_tensor * pos; // I32 [n_batch] ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] ggml_tensor * out_ids; // I32 [n_outputs] ggml_tensor * mean; // F32 [n_batch, n_batch] ggml_tensor * cls; // I32 [n_batch] - - // KQ mask input tensors - ggml_tensor * kq_mask; // F32 [n_tokens, n_batch] - ggml_tensor * kq_mask_cnv; // [n_tokens, n_batch] } inp; protected: @@ -292,7 +288,7 @@ class llama_context_base : public llama_context, public llama_graph_i { virtual ggml_cgraph * graph_init(); // TODO: add encode/decode graphs - virtual llama_graph_result graph_build( + virtual llama_graph_result_ptr graph_build( ggml_context * ctx, ggml_cgraph * gf, const llama_ubatch & ubatch); @@ -344,9 +340,10 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_backend_buffer * bbuf) override; ggml_tensor * build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) override; + llama_graph_result * res, + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) const override; ggml_tensor * build_inp_pos( ggml_context * ctx0, @@ -367,21 +364,23 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_context * ctx0, int32_t n_tokens) override; - void build_attn_inp( + llama_graph_input_attn_ptr build_attn_inp( + llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa) override; + bool swa) const override; ggml_tensor * build_attn( + llama_graph_input_attn_i * 
inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - float kq_scale, - int il) override; + float kq_scale, + int il) const override; protected: virtual ggml_tensor * build_attn_mha( @@ -393,7 +392,7 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_tensor * kq_b, ggml_tensor * kq_mask, bool v_trans, - float kq_scale); + float kq_scale) const; virtual ggml_tensor * build_inp_self_k_shift( ggml_context * ctx0); @@ -563,10 +562,6 @@ class llama_context_kv_self : public llama_context_base { private: struct { ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] - ggml_tensor * self_kq_mask; // F32 [n_kv, n_batch] - ggml_tensor * self_kq_mask_cnv; // [n_kv, n_batch] - ggml_tensor * self_kq_mask_swa; // F32 [n_kv, n_batch] - ggml_tensor * self_kq_mask_swa_cnv; // [n_kv, n_batch] ggml_tensor * self_k_shift; // I32 [kv_size] } inp; @@ -586,21 +581,23 @@ class llama_context_kv_self : public llama_context_base { ggml_context * ctx0, int32_t n_tokens) override; - void build_attn_inp( + llama_graph_input_attn_ptr build_attn_inp( + llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa) override; + bool swa) const override; ggml_tensor * build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - float kq_scale, - int il) override; + float kq_scale, + int il) const override; protected: ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; @@ -786,8 +783,6 @@ class llama_context_dec : public llama_context_kv_self { private: struct { ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - ggml_tensor * cross_kq_mask; // F32 [n_outputs_enc, n_batch] - ggml_tensor * cross_kq_mask_cnv; // F32 [n_outputs_enc, n_batch] } inp; protected: @@ -800,13 +795,15 @@ class llama_context_dec : public llama_context_kv_self { ggml_tensor * build_inp_cross_embd( ggml_context * ctx0) override; - void build_attn_inp( + llama_graph_input_attn_ptr build_attn_inp( + llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa) override; + bool swa) const override; ggml_tensor * build_attn_cross( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -814,7 +811,7 @@ class llama_context_dec : public llama_context_kv_self { ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) override; + int il) const override; public: llama_cross * cross = nullptr; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1e336e844ada0..549a42c53ba22 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2,17 +2,34 @@ #include "llama-impl.h" +ggml_tensor * llama_graph_input_attn_i::get_kq_mask() { + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_input_attn_i::get_kq_mask_swa() { + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_input_attn_i::get_kq_mask_cross() { + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + llama_graph_i::llama_graph_i(llama_graph_type type) : type(type) {} ggml_tensor * llama_graph_i::build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - float kq_scale, - int il) { + float kq_scale, + int il) const { + GGML_UNUSED(inp); 
GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(q_cur); @@ -27,6 +44,7 @@ ggml_tensor * llama_graph_i::build_attn( } ggml_tensor * llama_graph_i::build_attn_cross( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -34,7 +52,8 @@ ggml_tensor * llama_graph_i::build_attn_cross( ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) { + int il) const { + GGML_UNUSED(inp); GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(q_cur); diff --git a/src/llama-graph.h b/src/llama-graph.h index 28e8a563067db..a6a9ef00ca860 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include // note: do not add high-level objects here, such as llama_context, llama_kv_cache, etc. // not sure about llama_batch/llama_sbatch yet @@ -9,6 +11,7 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; struct ggml_backend_buffer; + struct llama_ubatch; enum llama_graph_type { @@ -17,13 +20,78 @@ enum llama_graph_type { LLAMA_GRAPH_TYPE_DECODER, }; -struct llama_graph_result { +// +// llama_graph_input +// + +class llama_graph_input_i { +public: + virtual ~llama_graph_input_i() = default; + + virtual void set_input(const llama_ubatch * ubatch) = 0; +}; + +using llama_graph_input_ptr = std::shared_ptr; + +class llama_graph_input_attn_i : public llama_graph_input_i { +public: + virtual ~llama_graph_input_attn_i() = default; + + virtual ggml_tensor * get_kq_mask(); + virtual ggml_tensor * get_kq_mask_swa(); + virtual ggml_tensor * get_kq_mask_cross(); +}; + +using llama_graph_input_attn_ptr = std::shared_ptr; + +// +// llama_graph_result +// + +class llama_graph_result_i { +public: + virtual ~llama_graph_result_i() = default; + + virtual ggml_tensor * get_logits() = 0; + virtual ggml_tensor * get_embd() = 0; + virtual ggml_tensor * get_embd_pooled() = 0; + + virtual void set_inputs(const llama_ubatch * ubatch) = 0; +}; + +using llama_graph_result_ptr = std::unique_ptr; + +class llama_graph_result : public llama_graph_result_i { +public: + llama_graph_result() = default; + virtual ~llama_graph_result() = default; + + ggml_tensor * get_logits() override { return t_logits; } + ggml_tensor * get_embd() override { return t_embd; } + ggml_tensor * get_embd_pooled() override { return t_embd_pooled; } + + void set_inputs(const llama_ubatch * ubatch) override { + for (auto & input : inputs) { + input->set_input(ubatch); + } + } + + void add_input(llama_graph_input_ptr && input) { + inputs.emplace_back(std::move(input)); + } + // important graph nodes ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; + + std::vector inputs; }; +// +// llama_graph +// + // TODO: can become more granular in the future class llama_graph_i { public: @@ -75,9 +143,10 @@ class llama_graph_i { // graph build API (context-specific) virtual ggml_tensor * build_inp_embd( + llama_graph_result * res, ggml_context * ctx0, ggml_tensor * tok_embd, - const llama_ubatch & ubatch) = 0; + const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. 
they don't mutate the llama_context that implements them virtual ggml_tensor * build_inp_pos( ggml_context * ctx0, @@ -98,23 +167,26 @@ class llama_graph_i { ggml_context * ctx0, int32_t n_tokens) = 0; - virtual void build_attn_inp( + virtual llama_graph_input_attn_ptr build_attn_inp( + llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa) = 0; + bool swa) const = 0; virtual ggml_tensor * build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - float kq_scale, - int il); + float kq_scale, + int il) const; virtual ggml_tensor * build_attn_cross( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -122,7 +194,7 @@ class llama_graph_i { ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il); + int il) const; virtual ggml_tensor * build_inp_cross_embd( ggml_context * ctx0); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 25a705c657cd9..b6adbb1a1bbed 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2,7 +2,6 @@ #include "llama-impl.h" #include "llama-mmap.h" -#include "llama-graph.h" #include "llama-batch.h" #include "llama-cparams.h" #include "llama-model-loader.h" @@ -3853,7 +3852,7 @@ struct llm_build_context { ggml_context * ctx0 = nullptr; llama_graph_i * lgf = nullptr; - llama_graph_result res; + std::unique_ptr res; // TODO: consider making the entire interface noexcept llm_build_context( @@ -3892,7 +3891,8 @@ struct llm_build_context { pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), ctx0 (ctx), - lgf (lgf) { + lgf (lgf), + res (std::make_unique()) { } // TODO: tmp @@ -3902,7 +3902,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf->build_inp_embd(ctx0, tok_embd, ubatch); + struct ggml_tensor * inpL = lgf->build_inp_embd(res.get(), ctx0, tok_embd, ubatch); cb(inpL, "inp_embd", -1); return inpL; @@ -4259,15 +4259,16 @@ struct llm_build_context { } struct ggml_tensor * build_attn( - struct ggml_cgraph * gf, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, // TODO: remove - float kq_scale, - int il) { + llama_graph_input_attn_i * inp, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, // TODO: remove + float kq_scale, + int il) { GGML_UNUSED(n_tokens); // these nodes are added to the graph together so that they are not reordered @@ -4276,7 +4277,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(inp, ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); cb(cur, "kqv_out", il); if (wo) { @@ -4295,15 +4296,16 @@ struct llm_build_context { } struct ggml_tensor * build_attn_cross( - struct ggml_cgraph * gf, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, // TODO: remove - float kq_scale, - int il) { + llama_graph_input_attn_i * inp, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, 
+ ggml_tensor * v_cur, + int32_t n_tokens, // TODO: remove + float kq_scale, + int il) { GGML_UNUSED(n_tokens); // these nodes are added to the graph together so that they are not reordered @@ -4312,7 +4314,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn_cross(ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); + ggml_tensor * cur = lgf->build_attn_cross(inp, ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); cb(cur, "kqv_out", il); if (wo) { @@ -4331,16 +4333,17 @@ struct llm_build_context { } struct ggml_tensor * build_attn_with_kq_b( - struct ggml_cgraph * gf, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * kq_b, - int32_t n_tokens, // TODO: remove - float kq_scale, - int il) { + llama_graph_input_attn_i * inp, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + int32_t n_tokens, // TODO: remove + float kq_scale, + int il) { GGML_UNUSED(n_tokens); // these nodes are added to the graph together so that they are not reordered @@ -4349,7 +4352,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, q_cur, k_cur, v_cur, kq_b, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(inp, ctx0, gf, q_cur, k_cur, v_cur, kq_b, kq_scale, il); cb(cur, "kqv_out", il); if (wo) { @@ -4397,7 +4400,7 @@ struct llm_build_context { } void append_pooling(struct ggml_cgraph * gf) { - struct ggml_tensor * inp = res.t_embd; + struct ggml_tensor * inp = res->t_embd; //// find result_norm tensor for input //for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { @@ -4457,7 +4460,7 @@ struct llm_build_context { } cb(cur, "result_embd_pooled", -1); - res.t_embd_pooled = cur; + res->t_embd_pooled = cur; ggml_build_forward_expand(gf, cur); } @@ -4495,7 +4498,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4548,7 +4551,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } @@ -4626,7 +4629,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -4637,7 +4640,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -4656,7 +4659,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4720,7 +4723,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } @@ -4782,7 +4785,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -4793,7 +4796,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -4812,7 +4815,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4856,7 +4859,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -4903,13 +4906,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -4928,7 +4931,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4962,7 +4965,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5007,13 +5010,13 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5033,7 +5036,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5084,7 +5087,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5129,12 +5132,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5156,7 +5159,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - 
lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5206,7 +5209,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -5277,7 +5280,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -5288,7 +5291,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, 0.5773502691896257f); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5308,7 +5311,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5353,7 +5356,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5405,13 +5408,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5430,7 +5433,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5463,7 +5466,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5511,12 +5514,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5531,7 +5534,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5558,7 +5561,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5605,13 +5608,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5645,7 +5648,7 @@ struct 
llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - lgf->build_attn_inp(ctx0, n_tokens, false, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5710,7 +5713,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); @@ -5774,7 +5777,7 @@ struct llm_build_context { cur = inpL; cb(cur, "result_embd", -1); - res.t_embd = cur; + res->t_embd = cur; ggml_build_forward_expand(gf, cur); } @@ -5790,7 +5793,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); inpL = build_norm(inpL, model.tok_norm, @@ -5823,7 +5826,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5871,12 +5874,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5893,7 +5896,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); if (model.pos_embd) { // inp_pos - contains the positions @@ -5956,13 +5959,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6012,12 +6015,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6035,7 +6038,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -6108,7 +6111,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6162,13 +6165,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = 
cur; ggml_build_forward_expand(gf, cur); } @@ -6186,7 +6189,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6228,7 +6231,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6275,13 +6278,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6300,7 +6303,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6343,7 +6346,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6388,13 +6391,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6413,7 +6416,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6461,7 +6464,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6506,13 +6509,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6531,7 +6534,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6574,7 +6577,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6651,13 +6654,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + 
res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6678,7 +6681,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6733,7 +6736,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -6773,7 +6776,7 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); @@ -6781,7 +6784,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6801,7 +6804,7 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lgf->build_attn_inp(ctx0, n_tokens, true, true); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6856,7 +6859,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -6916,7 +6919,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); @@ -6926,7 +6929,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6945,7 +6948,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -6981,7 +6984,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7025,13 +7028,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7051,7 +7054,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7084,7 +7087,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7132,12 +7135,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - 
res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7157,7 +7160,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7196,7 +7199,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7244,12 +7247,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7268,7 +7271,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7317,7 +7320,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7362,13 +7365,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7387,7 +7390,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7436,7 +7439,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7481,13 +7484,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7515,7 +7518,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7630,7 +7633,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, q_states, k_states, v_states, n_tokens, kq_scale, il); } @@ -7686,7 +7689,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", 
-1); - res.t_embd = cur; + res->t_embd = cur; // lm_head scaling const float scale_lmhead = float(n_embd_base)/float(n_embd); @@ -7697,7 +7700,7 @@ struct llm_build_context { cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7716,7 +7719,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { // norm @@ -7752,7 +7755,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -7799,13 +7802,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7824,7 +7827,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, true); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { // norm @@ -7866,7 +7869,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -7923,7 +7926,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -7934,7 +7937,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7954,7 +7957,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8003,7 +8006,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8049,13 +8052,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8103,13 +8106,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8129,7 +8132,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = 
lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -8203,7 +8206,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8247,7 +8250,7 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -8257,7 +8260,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8277,7 +8280,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, true); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8338,7 +8341,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8377,7 +8380,7 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -8387,7 +8390,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8412,7 +8415,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8461,7 +8464,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8507,13 +8510,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8532,7 +8535,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8576,7 +8579,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8627,13 +8630,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8656,7 +8659,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - 
lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8704,7 +8707,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8754,13 +8757,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8777,7 +8780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8834,7 +8837,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8881,12 +8884,12 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8905,7 +8908,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -8944,7 +8947,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -9025,12 +9028,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9049,7 +9052,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9086,7 +9089,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -9154,13 +9157,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9179,7 +9182,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - 
lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9233,7 +9236,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } @@ -9309,13 +9312,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9342,7 +9345,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9461,7 +9464,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, q_states, k_states, v_states, n_tokens, kq_scale, il); } @@ -9536,13 +9539,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9560,7 +9563,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9619,7 +9622,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, NULL, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); @@ -9687,14 +9690,14 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output cur = build_lora_mm(model.tok_embd, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9711,7 +9714,7 @@ struct llm_build_context { struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); - lgf->build_attn_inp(ctx0, n_tokens, false, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9740,7 +9743,7 @@ struct llm_build_context { struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b); - cur = build_attn_with_kq_b(gf, + cur = build_attn_with_kq_b(inp_attn.get(), gf, model.layers[il].wo_enc, nullptr, Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); cb(cur, "kqv_out", il); @@ -9793,7 +9796,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; ggml_build_forward_expand(gf, cur); } @@ -9814,7 +9817,7 @@ struct llm_build_context { const int64_t n_outputs_enc = embd_enc->ne[1]; - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9843,7 +9846,7 @@ struct llm_build_context { struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b); - cur = build_attn_with_kq_b(gf, + cur = build_attn_with_kq_b(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); cb(cur, "kqv_out", il); @@ -9875,7 +9878,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc); - cur = build_attn_cross(gf, + cur = build_attn_cross(inp_attn.get(), gf, model.layers[il].wo_cross, nullptr, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); cb(cur, "kqv_out", il); @@ -9955,13 +9958,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9977,7 +9980,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10004,7 +10007,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/float(n_embd_head), il); } @@ -10047,12 +10050,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10071,7 +10074,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10132,7 +10135,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); @@ -10177,12 +10180,12 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - 
res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10201,7 +10204,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10251,7 +10254,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10297,13 +10300,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10322,7 +10325,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10374,7 +10377,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10420,13 +10423,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10513,12 +10516,12 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10597,12 +10600,12 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10627,7 +10630,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10696,7 +10699,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); @@ -10757,7 +10760,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -10777,7 +10780,7 @@ struct llm_build_context { cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; 
ggml_build_forward_expand(gf, cur); } @@ -10927,13 +10930,13 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); - res.t_embd = cur; + res->t_embd = cur; ggml_build_forward_expand(gf, cur); } }; -llama_graph_result llama_model::build_graph( +llama_graph_result_ptr llama_model::build_graph( ggml_context * ctx, ggml_cgraph * gf, llama_graph_i * lgf, @@ -11166,7 +11169,7 @@ llama_graph_result llama_model::build_graph( llm.append_pooling(gf); } - return llm.res; + return std::move(llm.res); } // diff --git a/src/llama-model.h b/src/llama-model.h index 447fc0d0576d6..2d64c0d242c09 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -3,6 +3,7 @@ #include "llama.h" #include "llama-arch.h" #include "llama-hparams.h" +#include "llama-graph.h" #include "llama-vocab.h" #include @@ -10,11 +11,9 @@ #include #include -class llama_graph_i; struct llama_cparams; struct llama_ubatch; struct llama_model_loader; -struct llama_graph_result; // available models enum llm_type { @@ -367,7 +366,7 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; // TODO: add encode/decode graphs - llama_graph_result build_graph( + llama_graph_result_ptr build_graph( ggml_context * ctx, ggml_cgraph * gf, llama_graph_i * lgf, From 9cab53c7ddeb029c7aeb787cf9fa7ea1779ba4b4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 18:01:25 +0200 Subject: [PATCH 82/84] cont : migrate the rest of the inputs out of llama_context ggml-ci --- src/llama-context.cpp | 922 ++++++++++++++++++++++++------------------ src/llama-context.h | 127 ++---- src/llama-graph.cpp | 86 ++-- src/llama-graph.h | 53 +-- src/llama-model.cpp | 36 +- 5 files changed, 646 insertions(+), 578 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5ac28f983027e..8587f480fd96f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,6 +71,243 @@ void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } +class llama_graph_input_pos : public llama_graph_input_i { +public: + llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + virtual ~llama_graph_input_pos() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos = nullptr; // I32 [n_batch] + + const int64_t n_pos_per_token = 1; +}; + +void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { + if (ubatch->pos && pos) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + } +} + +class llama_graph_input_pos_bucket : public llama_graph_input_i { +public: + llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} + virtual ~llama_graph_input_pos_bucket() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] + + const llama_hparams & hparams; +}; + +void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { + if (pos_bucket) { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + + int32_t * data = (int32_t *) pos_bucket->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_tokens; ++i) { + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], 
hparams.n_rel_attn_bkts, true);
+                }
+            }
+        }
+    }
+}
+
+class llama_graph_input_out_ids : public llama_graph_input_i {
+public:
+    llama_graph_input_out_ids(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+    virtual ~llama_graph_input_out_ids() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * out_ids; // I32 [n_outputs]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const int32_t n_outputs;
+};
+
+void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
+    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        //GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
+
+        if (!out_ids) {
+            LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch->n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
+            int32_t * data = (int32_t *) out_ids->data;
+
+            if (n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
+                }
+            } else if (ubatch->output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch->output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(n_outputs == n_outputs);
+            } else if (n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(n_outputs == 0);
+            }
+        }
+    }
+}
+
+class llama_graph_input_mean : public llama_graph_input_i {
+public:
+    llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
+    virtual ~llama_graph_input_mean() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * mean; // F32 [n_batch, n_batch]
+
+    const llama_cparams & cparams;
+};
+
+void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+        const int64_t n_tokens = ubatch->n_tokens;
+        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+        const int64_t n_seqs = ubatch->n_seqs;
+
+        GGML_ASSERT(mean);
+        GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer));
+
+        float * data = (float *) mean->data;
+        memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean));
+
+        std::vector<uint64_t> sum(n_tokens, 0);
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+            // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
+
+            sum[seq_id] += ubatch->n_seq_tokens;
+        }
+
+        std::vector<float> div(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const uint64_t s = sum[i];
+            if (s > 0) {
+                div[i] = 1.0f/float(s);
+            }
+        }
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+            for (int i = 0; i < n_seq_tokens; ++i) {
+                data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
+            }
+        }
+    }
+}
+
+class llama_graph_input_cls : public llama_graph_input_i {
+public:
+    llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    virtual ~llama_graph_input_cls() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cls; // I32 [n_batch]
+
+    const llama_cparams & cparams;
+};
+
+void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) {
+    if (cparams.embeddings && (
+            cparams.pooling_type == LLAMA_POOLING_TYPE_CLS
|| + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } +} + class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : @@ -846,7 +1083,6 @@ int llama_context_base::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove, tmp here, until all inputs are migrated outside the context const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -1003,7 +1239,6 @@ int llama_context_base::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -1132,178 +1367,6 @@ int64_t llama_context_base::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 
4 : 1; } -void llama_context_base::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (ubatch.pos && inp.pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp.pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp.pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp.out_ids && "every model that can must skip unused outputs"); - - if (!inp.out_ids) { - LLAMA_LOG_WARN("%s: 'inp.out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.out_ids->buffer)); - int32_t * data = (int32_t *) inp.out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp.mean->buffer)); - - float * data = (float *) inp.mean->data; - memset(inp.mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp.mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - - uint32_t * data = (uint32_t *) inp.cls->data; - memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.cls); - 
GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - - uint32_t * data = (uint32_t *) inp.cls->data; - memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } - - if (inp.pos_bucket) { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.pos_bucket->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - - int32_t * data = (int32_t *) inp.pos_bucket->data; - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, true); - } - } - } - } - - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); -} - // // output // @@ -1423,8 +1486,6 @@ int32_t llama_context_base::graph_max_nodes() const { } ggml_cgraph * llama_context_base::graph_init() { - inp = {}; - struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta.size(), /*.mem_buffer =*/ buf_compute_meta.data(), @@ -1478,7 +1539,7 @@ void llama_context_base::build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il) { + int il) const { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); } else { @@ -1498,7 +1559,7 @@ void llama_context_base::build_cb( if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); - for (auto & backend : backends) { + for (const auto & backend : backends) { if (ggml_backend_get_device(backend.get()) == dev_layer) { if (ggml_backend_supports_op(backend.get(), cur)) { ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get()); @@ -1512,14 +1573,14 @@ void llama_context_base::build_cb( ggml_tensor * llama_context_base::build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il) { + int il) const { return cvec.apply_to(ctx0, cur, il); } ggml_tensor * llama_context_base::build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur) { + ggml_tensor * cur) const { struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); for (const auto & lora : loras) { @@ -1547,7 +1608,7 @@ ggml_tensor * llama_context_base::build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur, - ggml_tensor * ids) { + ggml_tensor * ids) const { struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); for (const auto & lora : loras) { struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); @@ -1572,7 +1633,7 @@ ggml_tensor * llama_context_base::build_lora_mm_id( return res; } -ggml_tensor * llama_context_base::build_rope_factors(int il) { +ggml_tensor * 
llama_context_base::build_rope_factors(int il) const { const auto & hparams = model.hparams; // choose long/short freq factors based on the context size @@ -1594,7 +1655,7 @@ ggml_tensor * llama_context_base::build_rope_shift( ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuf) { + ggml_backend_buffer * bbuf) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & freq_base = cparams.rope_freq_base; const auto & freq_scale = cparams.rope_freq_scale; @@ -1614,7 +1675,7 @@ ggml_tensor * llama_context_base::build_rope_shift( tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); if (bbuf) { - for (auto & backend : backends) { + for (const auto & backend : backends) { // Figure out which backend KV cache belongs to if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); @@ -1693,50 +1754,73 @@ ggml_tensor * llama_context_base::build_inp_embd( return inpL; } -ggml_tensor * llama_context_base::build_inp_pos( - ggml_context * ctx0, - int32_t n_tokens) { - inp.pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp.pos); +ggml_tensor * llama_context_base::build_inp_pos( + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(n_pos_per_token()); + + inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp->pos); + + res->add_input(inp); - return inp.pos; + return inp->pos; } ggml_tensor * llama_context_base::build_inp_pos_bucket( - ggml_context * ctx0, - int32_t n_tokens) { - inp.pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - ggml_set_input(inp.pos_bucket); + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(model.hparams); + + inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + ggml_set_input(inp->pos_bucket); + + res->add_input(inp); - return inp.pos_bucket; + return inp->pos_bucket; } ggml_tensor * llama_context_base::build_inp_out_ids( - ggml_context * ctx0) { - const int32_t n_out_ids = n_outputs; + llama_graph_result * res, + ggml_context * ctx0) const { + auto inp = std::make_shared(model.hparams, cparams, n_outputs); - inp.out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); - ggml_set_input(inp.out_ids); + inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + ggml_set_input(inp->out_ids); + + res->add_input(inp); - return inp.out_ids; + return inp->out_ids; } ggml_tensor * llama_context_base::build_inp_mean( - ggml_context * ctx0, - int32_t n_tokens) { - inp.mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp.mean); + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(cparams); + + inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp->mean); - return inp.mean; + res->add_input(inp); + + return inp->mean; } ggml_tensor * llama_context_base::build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) { - inp.cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp.cls); + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(cparams); + + inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->cls); + + res->add_input(inp); - return inp.cls; + 
return inp->cls; } llama_graph_input_attn_ptr llama_context_base::build_attn_inp( @@ -1887,33 +1971,6 @@ ggml_tensor * llama_context_base::build_attn_mha( return cur; } -ggml_tensor * llama_context_base::build_inp_self_k_shift( - ggml_context * ctx0) { - GGML_UNUSED(ctx0); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); - return nullptr; -} - -void llama_context_base::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf) { - GGML_UNUSED(ctx0); - GGML_UNUSED(gf); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); -} - -void llama_context_base::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf) { - GGML_UNUSED(ctx0); - GGML_UNUSED(gf); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); -} - - // // perf // @@ -2428,6 +2485,68 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i // llama_context_kv_self // +class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { +public: + llama_graph_input_pos_bucket_kv( + const llama_hparams & hparams, + const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {} + virtual ~llama_graph_input_pos_bucket_kv() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] + + const llama_hparams & hparams; + const llama_kv_cache_unified * kv_self; +}; + +void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { + if (pos_bucket) { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + + int32_t * data = (int32_t *) pos_bucket->data; + + const int64_t n_kv = kv_self->n; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_kv; ++i) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false); + } + } + } + } +} + +class llama_graph_input_k_shift : public llama_graph_input_i { +public: + llama_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} + virtual ~llama_graph_input_k_shift() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * k_shift; // I32 [kv_size] + + const llama_kv_cache_unified * kv_self; +}; + +void llama_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (k_shift) { + assert(ggml_backend_buffer_is_host(k_shift->buffer)); + + int32_t * data = (int32_t *) k_shift->data; + + for (uint32_t i = 0; i < kv_self->size; ++i) { + data[i] = kv_self->cells[i].delta; + } + } +} + class llama_graph_input_attn_kv_self : public llama_graph_input_attn_i { public: llama_graph_input_attn_kv_self( @@ -2661,11 +2780,11 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - build_kv_self_shift(ctx_compute.get(), gf); + auto res = graph_build_kv_self_shift(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set({}); + res->set_inputs(nullptr); graph_compute(gf, false); @@ -2689,7 +2808,7 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - build_kv_self_defrag(ctx_compute.get(), gf); + graph_build_kv_self_defrag(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2792,7 +2911,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - 
input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -3031,7 +3149,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -3190,66 +3307,24 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (inp.self_k_shift) { - assert(ggml_backend_buffer_is_host(inp.self_k_shift->buffer)); - - int32_t * data = (int32_t *) inp.self_k_shift->data; - - for (uint32_t i = 0; i < kv_self->size; ++i) { - data[i] = kv_self->cells[i].delta; - } - - // the K-shift graph requires just this input - return; - } - - // call base functionality - llama_context_base::input_set(ubatch); - - if (inp.self_pos_bucket) { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_pos_bucket->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - - int32_t * data = (int32_t *) inp.self_pos_bucket->data; - - const int64_t n_kv = kv_self->n; - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); - } - } - } - } -} - ggml_cgraph * llama_context_kv_self::graph_init() { - inp = {}; - return llama_context_base::graph_init(); } -ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { - inp.self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); - ggml_set_input(inp.self_k_shift); - - return inp.self_k_shift; -} - ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( - ggml_context * ctx0, - int32_t n_tokens) { + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(model.hparams, kv_self.get()); + const auto n_kv = kv_self->n; - inp.self_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - ggml_set_input(inp.self_pos_bucket); + inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + ggml_set_input(inp->pos_bucket); - return inp.self_pos_bucket; + res->inputs.push_back(inp); + + return inp->pos_bucket; } llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( @@ -3404,9 +3479,11 @@ ggml_tensor * llama_context_kv_self::build_attn( return cur; } -void llama_context_kv_self::build_kv_self_shift( +llama_graph_result_ptr llama_context_kv_self::graph_build_kv_self_shift( ggml_context * ctx0, - ggml_cgraph * gf) { + ggml_cgraph * gf) const { + auto res = std::make_unique(); + const auto & hparams = model.hparams; const auto & n_layer = hparams.n_layer; @@ -3416,7 +3493,12 @@ void llama_context_kv_self::build_kv_self_shift( //GGML_ASSERT(kv_self->size == n_ctx); - ggml_tensor * inp_self_k_shift = build_inp_self_k_shift(ctx0); + auto inp = std::make_shared(kv_self.get()); + + inp->k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + ggml_set_input(inp->k_shift); + + res->add_input(inp); for (uint32_t il = 0; il < n_layer; ++il) { const int64_t n_head_kv = hparams.n_head_kv(il); @@ -3431,15 +3513,17 @@ void llama_context_kv_self::build_kv_self_shift( 
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self->k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, kv_self->k_l[il]->buffer); ggml_build_forward_expand(gf, cur); } + + return res; } -void llama_context_kv_self::build_kv_self_defrag( +llama_graph_result_ptr llama_context_kv_self::graph_build_kv_self_defrag( ggml_context * ctx0, - ggml_cgraph * gf) { + ggml_cgraph * gf) const { const auto & hparams = model.hparams; const uint32_t n_layer = hparams.n_layer; @@ -3454,7 +3538,7 @@ void llama_context_kv_self::build_kv_self_defrag( // number of cells moved uint32_t n_moves = 0; - // each move requires 6*n_layer tensors (see build_kv_self_defrag) + // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag) // - source view, destination view, copy operation // - x2 for keys and values //const uint32_t max_moves = max_nodes()/(6*n_layer); @@ -3565,7 +3649,7 @@ void llama_context_kv_self::build_kv_self_defrag( } if (n_moves == 0) { - return; + return nullptr; } //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); @@ -3705,6 +3789,8 @@ void llama_context_kv_self::build_kv_self_defrag( //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); #endif + + return nullptr; } // state save/load @@ -3747,6 +3833,89 @@ size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_se // llama_context_recurrent // +class llama_graph_input_s_copy : public llama_graph_input_i { +public: + llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} + virtual ~llama_graph_input_s_copy() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * s_copy; // I32 [kv_size] + + llama_kv_cache_recurrent * kv_self; +}; + +void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + const int64_t n_kv = kv_self->n; + + if (s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); + int32_t * data = (int32_t *) s_copy->data; + + // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! 
+ + // prevent out-of-bound sources + if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { + kv_cell.src = cell_id; + } + + data[i] = kv_cell.src; + + // TODO: do not mutate the KV cache + // ensure copy only happens once + if (kv_cell.src != (int32_t) cell_id) { + kv_cell.src = cell_id; + } + } + } +} + +class llama_graph_input_s_mask : public llama_graph_input_i { +public: + llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} + virtual ~llama_graph_input_s_mask() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * s_mask; // F32 [1, n_kv] + + llama_kv_cache_recurrent * kv_self; +}; + +void llama_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + const int64_t n_kv = kv_self->n; + + if (s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); + float * data = (float *) s_mask->data; + + // clear unused states + for (int i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; + + data[i] = (float) (kv_cell.src >= 0); + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! + // only clear once + if (kv_cell.src < 0) { + kv_cell.src = cell_id; + } + } + } +} + llama_context_recurrent::llama_context_recurrent( const llama_model & model, llama_context_params params, @@ -3985,7 +4154,6 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -4130,85 +4298,40 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { return 0; } -void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { - // call base functionality - llama_context_base::input_set(ubatch); - - GGML_ASSERT(kv_self->recurrent); - - const int64_t n_kv = kv_self->n; - - if (inp.s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_mask->buffer)); - float * data = (float *) inp.s_mask->data; - - // clear unused states - for (int i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self->head; - llama_kv_cell & kv_cell = kv_self->cells[cell_id]; - - data[i] = (float) (kv_cell.src >= 0); - - // TODO: do not mutate the KV cache - // only clear once - if (kv_cell.src < 0) { - kv_cell.src = cell_id; - } - } - } - - if (inp.s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_copy->buffer)); - int32_t * data = (int32_t *) inp.s_copy->data; - - // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self->head; - llama_kv_cell & kv_cell = kv_self->cells[cell_id]; - - // prevent out-of-bound sources - if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { - kv_cell.src = cell_id; - } - - data[i] = kv_cell.src; - - // TODO: do not mutate the KV cache - // ensure copy only happens once - if (kv_cell.src != (int32_t) cell_id) { - kv_cell.src = cell_id; - } - } - } -} - ggml_cgraph * llama_context_recurrent::graph_init() { - inp.s_copy = nullptr; - inp.s_mask = nullptr; - return llama_context_base::graph_init(); } ggml_tensor * llama_context_recurrent::build_inp_s_copy( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + auto inp = std::make_shared(kv_self.get()); + const auto n_kv = kv_self->n; - 
inp.s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); //cb(inp.s_copy, "inp_s_copy", -1); - ggml_set_input(inp.s_copy); + ggml_set_input(inp->s_copy); + + res->add_input(inp); - return inp.s_copy; + return inp->s_copy; } ggml_tensor * llama_context_recurrent::build_inp_s_mask( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + auto inp = std::make_shared(kv_self.get()); + const auto n_kv = kv_self->n; - inp.s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - //cb(inp.s_mask, "inp_s_mask", -1); - ggml_set_input(inp.s_mask); + inp->s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp->s_mask, "inp_s_mask", -1); + ggml_set_input(inp->s_mask); + + res->add_input(inp); - return inp.s_mask; + return inp->s_mask; } ggml_tensor * llama_context_recurrent::build_copy_mask_state( @@ -4218,7 +4341,7 @@ ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_tensor * state_copy, ggml_tensor * state_mask, int32_t n_state, - int32_t n_seqs) { + int32_t n_seqs) const { const auto n_kv = kv_self->n; const auto kv_head = kv_self->head; @@ -4251,7 +4374,7 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) { + int il) const { const auto & hparams = model.hparams; const auto kv_head = kv_self->head; @@ -4383,7 +4506,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_load( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) { + int il) const { const auto & hparams = model.hparams; const auto token_shift_count = hparams.token_shift_count; @@ -4405,7 +4528,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il) { + int il) const { const auto & hparams = model.hparams; const auto token_shift_count = hparams.token_shift_count; @@ -4430,7 +4553,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) { + int il) const { const auto & hparams = model.hparams; const auto n_tokens = ubatch.n_tokens; @@ -4693,7 +4816,6 @@ int llama_context_enc::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -4782,6 +4904,29 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // llama_context_dec // +class llama_graph_input_cross_embd : public llama_graph_input_i { +public: + llama_graph_input_cross_embd( + const llama_cross * cross) : cross(cross) {} + virtual ~llama_graph_input_cross_embd() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] + + const llama_cross * cross; +}; + +void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (cross_embd && cross->t_embd) { + assert(cross_embd->type == GGML_TYPE_F32); + + ggml_backend_tensor_set(cross_embd, cross->v_embd, 0, ggml_nbytes(cross_embd)); + } +} + class llama_graph_input_attn_dec : public llama_graph_input_attn_i { public: llama_graph_input_attn_dec( @@ -4841,32 +4986,21 @@ void llama_context_dec::reserve() { llama_context_kv_self::reserve(); } -void 
llama_context_dec::input_set(const llama_ubatch & ubatch) { - // call base functionality - llama_context_kv_self::input_set(ubatch); - - if (inp.cross_embd && cross->t_embd) { - assert(inp.cross_embd->type == GGML_TYPE_F32); - - ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd)); - } - -} - ggml_cgraph * llama_context_dec::graph_init() { - inp = {}; - return llama_context_kv_self::graph_init(); } ggml_tensor * llama_context_dec::build_inp_cross_embd( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + auto inp = std::make_shared(cross); + // if we have the output embeddings from the encoder, use them directly // TODO: needs more work to be correct, for now just use the tensor shape //if (cross->t_embd) { - // inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + // inp->cross_embd = ggml_view_tensor(ctx0, cross->t_embd); - // return inp.cross_embd; + // return inp->cross_embd; //} const auto & hparams = model.hparams; @@ -4874,10 +5008,12 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd( const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; const auto n_enc = cross->t_embd ? cross->t_embd->ne[1] : hparams.n_ctx_train; - inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); - ggml_set_input(inp.cross_embd); + inp->cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); + ggml_set_input(inp->cross_embd); + + res->add_input(inp); - return inp.cross_embd; + return inp->cross_embd; } llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( diff --git a/src/llama-context.h b/src/llama-context.h index 0f248537eded3..21015e8796e40 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -248,24 +248,6 @@ class llama_context_base : public llama_context, public llama_graph_i { virtual int64_t n_pos_per_token() const; // vision - // when the compute graph is built, it creates the input tensors that it needs - // the contents of the input tensors are set by the input_set() function - - // TODO: remove, replace by llama_graph_input_i->set_input() - virtual void input_set(const llama_ubatch & ubatch); - -private: - // TODO: remove, implement as llama_graph_input_xxx - struct { - // base input tensors - ggml_tensor * pos; // I32 [n_batch] - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - ggml_tensor * out_ids; // I32 [n_outputs] - ggml_tensor * mean; // F32 [n_batch, n_batch] - ggml_tensor * cls; // I32 [n_batch] - } inp; - -protected: // // output // @@ -309,35 +291,35 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il) override; + int il) const override; // apply control vector for layer il ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il) override; + int il) const override; // do mat_mul, while optionally apply lora ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur) override; + ggml_tensor * cur) const override; // do mat_mul_id, while optionally apply lora ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids) override; + ggml_tensor * ids) const override; - ggml_tensor * build_rope_factors(int il) override; + ggml_tensor * build_rope_factors(int il) const override; ggml_tensor * build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - 
ggml_backend_buffer * bbuf) override; + ggml_backend_buffer * bbuf) const override; ggml_tensor * build_inp_embd( llama_graph_result * res, @@ -346,23 +328,28 @@ class llama_context_base : public llama_context, public llama_graph_i { const llama_ubatch & ubatch) const override; ggml_tensor * build_inp_pos( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_out_ids( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * build_inp_mean( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_cls( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -394,18 +381,6 @@ class llama_context_base : public llama_context, public llama_graph_i { bool v_trans, float kq_scale) const; - virtual ggml_tensor * build_inp_self_k_shift( - ggml_context * ctx0); - - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf); - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf); - public: // // perf @@ -552,19 +527,6 @@ class llama_context_kv_self : public llama_context_base { int encode(llama_batch & inp_batch) override; int decode(llama_batch & inp_batch) override; -protected: - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] - ggml_tensor * self_k_shift; // I32 [kv_size] - } inp; - protected: // // graph @@ -578,8 +540,9 @@ class llama_context_kv_self : public llama_context_base { // ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -600,16 +563,14 @@ class llama_context_kv_self : public llama_context_base { int il) const override; protected: - ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; - - void build_kv_self_shift( + llama_graph_result_ptr graph_build_kv_self_shift( ggml_context * ctx0, - ggml_cgraph * gf) override; + ggml_cgraph * gf) const; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - void build_kv_self_defrag( + llama_graph_result_ptr graph_build_kv_self_defrag( ggml_context * ctx0, - ggml_cgraph * gf) override; + ggml_cgraph * gf) const; // // state save/load @@ -651,19 +612,6 @@ class llama_context_recurrent : public llama_context_base { int encode(llama_batch & inp_batch) override; int decode(llama_batch & inp_batch) override; -protected: - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * s_copy; // I32 [kv_size] - ggml_tensor * s_mask; // F32 [1, n_kv] - } inp; - protected: // // graph @@ -677,10 +625,12 @@ class llama_context_recurrent : public llama_context_base { // ggml_tensor * build_inp_s_copy( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * 
build_inp_s_mask( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -689,7 +639,7 @@ class llama_context_recurrent : public llama_context_base { ggml_tensor * state_copy, ggml_tensor * state_mask, int32_t n_state, - int32_t n_seqs) override; + int32_t n_seqs) const override; ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -698,7 +648,7 @@ class llama_context_recurrent : public llama_context_base { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -706,13 +656,13 @@ class llama_context_recurrent : public llama_context_base { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -722,7 +672,7 @@ class llama_context_recurrent : public llama_context_base { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; protected: // @@ -774,18 +724,6 @@ class llama_context_dec : public llama_context_kv_self { protected: void reserve() override; - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - } inp; - -protected: // // graph // @@ -793,7 +731,8 @@ class llama_context_dec : public llama_context_kv_self { ggml_cgraph * graph_init() override; ggml_tensor * build_inp_cross_embd( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 549a42c53ba22..79b26d1734ca3 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -68,25 +68,19 @@ ggml_tensor * llama_graph_i::build_attn_cross( } ggml_tensor * llama_graph_i::build_inp_cross_embd( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } -ggml_tensor * llama_graph_i::build_inp_cross_kq_mask( - ggml_context * ctx0, - int32_t n_tokens) { - GGML_UNUSED(ctx0); - GGML_UNUSED(n_tokens); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); - return nullptr; -} - ggml_tensor * llama_graph_i::build_inp_s_copy ( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -95,7 +89,9 @@ ggml_tensor * llama_graph_i::build_inp_s_copy ( } ggml_tensor * llama_graph_i::build_inp_s_mask( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -104,13 +100,13 @@ ggml_tensor * llama_graph_i::build_inp_s_mask( } ggml_tensor * llama_graph_i::build_copy_mask_state( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * s, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - int32_t n_state, - int32_t n_seqs) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * s, + ggml_tensor 
* state_copy, + ggml_tensor * state_mask, + int32_t n_state, + int32_t n_seqs) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(s); @@ -125,13 +121,13 @@ ggml_tensor * llama_graph_i::build_copy_mask_state( } ggml_tensor * llama_graph_i::build_mamba_layer( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * cur, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); @@ -146,12 +142,12 @@ ggml_tensor * llama_graph_i::build_mamba_layer( } ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(state_copy); @@ -165,10 +161,10 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( } ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( - ggml_context * ctx0, - ggml_tensor * token_shift, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(token_shift); GGML_UNUSED(ubatch); @@ -180,14 +176,14 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( } ggml_tensor * llama_graph_i::build_rwkv6_time_mix( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * cur, - ggml_tensor * x_prev, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); diff --git a/src/llama-graph.h b/src/llama-graph.h index a6a9ef00ca860..7ae99becc7e23 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -93,6 +93,7 @@ class llama_graph_result : public llama_graph_result_i { // // TODO: can become more granular in the future +// TODO: move all methods that do not require things from llama_context to llm_build_context class llama_graph_i { public: llama_graph_i(llama_graph_type type); @@ -109,28 +110,28 @@ class llama_graph_i { ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il) = 0; + int il) const = 0; // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il) = 0; + int il) const = 0; // do mat_mul, while optionally apply lora virtual ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur) = 0; + ggml_tensor * cur) const = 0; // do mat_mul_id, while optionally apply lora virtual ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids) = 0; + ggml_tensor * ids) const = 0; - virtual ggml_tensor * build_rope_factors(int il) = 0; + virtual ggml_tensor * build_rope_factors(int il) const = 0; // note: optionally set the backend to be the same as the bbuf's backend virtual ggml_tensor * build_rope_shift( @@ -138,7 +139,7 @@ class llama_graph_i { ggml_tensor * cur, 
ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuf) = 0; + ggml_backend_buffer * bbuf) const = 0; // graph build API (context-specific) @@ -146,26 +147,31 @@ class llama_graph_i { llama_graph_result * res, ggml_context * ctx0, ggml_tensor * tok_embd, - const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. they don't mutate the llama_context that implements them + const llama_ubatch & ubatch) const = 0; virtual ggml_tensor * build_inp_pos( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0) = 0; + llama_graph_result * res, + ggml_context * ctx0) const = 0; virtual ggml_tensor * build_inp_mean( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_cls( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -197,17 +203,16 @@ class llama_graph_i { int il) const; virtual ggml_tensor * build_inp_cross_embd( - ggml_context * ctx0); - - virtual ggml_tensor * build_inp_cross_kq_mask( - ggml_context * ctx0, - int32_t n_tokens); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -216,7 +221,7 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, int32_t n_state, - int32_t n_seqs); + int32_t n_seqs) const; virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -225,7 +230,7 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -233,13 +238,13 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -249,5 +254,5 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b6adbb1a1bbed..7fae82c6ecc49 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3910,7 +3910,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens); cb(cur, "inp_pos", -1); return cur; @@ -3918,7 +3918,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(ctx0); + ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0); cb(cur, "inp_out_ids", -1); return cur; @@ -3926,7 +3926,7 @@ struct llm_build_context { // TODO: tmp struct 
ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens); cb(cur, "inp_mean", -1); return cur; @@ -3934,7 +3934,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens); cb(cur, "inp_cls", -1); return cur; @@ -3957,7 +3957,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_pos_bucket() { - ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens); cb(cur, "pos_bucket", -1); return cur; @@ -3965,20 +3965,12 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_cross_embd() { - ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0); + ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0); cb(cur, "embd_enc", -1); return cur; } - // TODO: tmp - struct ggml_tensor * build_inp_cross_kq_mask() { - ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens); - cb(cur, "KQ_mask_cross", -1); - - return cur; - } - struct ggml_tensor * build_norm( struct ggml_tensor * cur, struct ggml_tensor * mw, @@ -3986,8 +3978,8 @@ struct llm_build_context { llm_norm_type type, int il) { switch (type) { - case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break; case LLM_NORM_GROUP: { cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); @@ -8070,8 +8062,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); for (int il = 0; il < n_layer; ++il) { // norm @@ -10443,8 +10435,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10535,8 +10527,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; From 0f7daa9d1bce23b962d6c648dc4d7f71d338c8c6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 19:56:10 +0200 Subject: [PATCH 83/84] graph : move non-context related logic to llm_build_context ggml-ci --- src/llama-context.cpp | 520 +++++++++--------------------------------- src/llama-context.h | 118 ++++------ 
src/llama-graph.cpp | 12 +- src/llama-graph.h | 67 +++--- src/llama-model.cpp | 425 +++++++++++++++++++++++++++------- src/llama-model.h | 1 - 6 files changed, 529 insertions(+), 614 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8587f480fd96f..7ba86a2a7f91a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,26 +71,7 @@ void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } -class llama_graph_input_pos : public llama_graph_input_i { -public: - llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} - virtual ~llama_graph_input_pos() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * pos = nullptr; // I32 [n_batch] - - const int64_t n_pos_per_token = 1; -}; - -void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { - if (ubatch->pos && pos) { - const int64_t n_tokens = ubatch->n_tokens; - - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); - } -} - +// I32 [n_batch, n_batch] class llama_graph_input_pos_bucket : public llama_graph_input_i { public: llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} @@ -98,19 +79,17 @@ class llama_graph_input_pos_bucket : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - const llama_hparams & hparams; }; void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { - if (pos_bucket) { + if (cur) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) cur->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -122,192 +101,6 @@ void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { } } -class llama_graph_input_out_ids : public llama_graph_input_i { -public: - llama_graph_input_out_ids( - const llama_hparams & hparams, - const llama_cparams & cparams, - int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} - virtual ~llama_graph_input_out_ids() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * out_ids; // I32 [n_outputs] - - const llama_hparams & hparams; - const llama_cparams & cparams; - - const int32_t n_outputs; -}; - -void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); - - if (!out_ids) { - LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch->n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); - int32_t * data = (int32_t *) out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch->output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch->output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } 
- } -} - -class llama_graph_input_mean : public llama_graph_input_i { -public: - llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} - virtual ~llama_graph_input_mean() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * mean; // F32 [n_batch, n_batch] - - const llama_cparams & cparams; -}; - -void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) { - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(mean); - GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); - - float * data = (float *) mean->data; - memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch->n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } -} - -class llama_graph_input_cls : public llama_graph_input_i { -public: - llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} - virtual ~llama_graph_input_cls() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cls; // I32 [n_batch] - - const llama_cparams & cparams; -}; - -void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) { - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(cls); - GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_tokens * ggml_element_size(cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(cls); - GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_tokens * ggml_element_size(cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than 
n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } -} - class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : @@ -1359,14 +1152,6 @@ int llama_context_base::decode(llama_batch & inp_batch) { return 0; } -// -// input -// - -int64_t llama_context_base::n_pos_per_token() const { - return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; -} - // // output // @@ -1535,6 +1320,10 @@ enum ggml_status llama_context_base::graph_compute( // graph build API // +int32_t llama_context_base::get_n_outputs() const { + return n_outputs; +} + void llama_context_base::build_cb( ggml_tensor * cur, const char * name, @@ -1650,57 +1439,7 @@ ggml_tensor * llama_context_base::build_rope_factors(int il) const { return model.layers[il].rope_short; } -ggml_tensor * llama_context_base::build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const { - const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & freq_base = cparams.rope_freq_base; - const auto & freq_scale = cparams.rope_freq_scale; - - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_attn_factor = cparams.yarn_attn_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; - - const auto & n_rot = model.hparams.n_rot; - const auto & rope_type = model.hparams.rope_type; - - struct ggml_tensor * tmp; - - if (ggml_is_quantized(cur->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); - - if (bbuf) { - for (const auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - } - - tmp = ggml_rope_ext_inplace(ctx0, tmp, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - - tmp = ggml_cpy(ctx0, tmp, cur); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, cur, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - } - - return tmp; -} - -ggml_tensor * llama_context_base::build_inp_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_context_base::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const { @@ -1710,14 +1449,14 @@ ggml_tensor * llama_context_base::build_inp_embd( auto inp = std::make_shared(); - struct ggml_tensor * inpL; + auto & cur = inp->cur; if (ubatch.token) { inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); //cb(inp->tokens, "inp_tokens", -1); ggml_set_input(inp->tokens); - inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens); + cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); // apply lora for embedding tokens if needed for (const auto & lora : loras) { @@ -1734,97 +1473,36 @@ ggml_tensor * llama_context_base::build_inp_embd( 
ggml_get_rows(ctx0, lw->a, inp->tokens) ), scale); - inpL = ggml_add(ctx0, inpL, inpL_delta); + cur = ggml_add(ctx0, cur, inpL_delta); } } else { inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp->embd; + cur = inp->embd; ggml_set_input(inp->embd); } // For Granite architecture if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); } - res->add_input(std::move(inp)); - - //cb(inpL, "inp_embd", -1); - - return inpL; -} - -ggml_tensor * llama_context_base::build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(n_pos_per_token()); - - inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp->pos); - - res->add_input(inp); + //cb(cur, "inp_embd", -1); - return inp->pos; + return inp; } -ggml_tensor * llama_context_base::build_inp_pos_bucket( - llama_graph_result * res, +llama_graph_input_ptr llama_context_base::build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const { auto inp = std::make_shared(model.hparams); - inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - ggml_set_input(inp->pos_bucket); - - res->add_input(inp); - - return inp->pos_bucket; -} - -ggml_tensor * llama_context_base::build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const { - auto inp = std::make_shared(model.hparams, cparams, n_outputs); - - inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); - ggml_set_input(inp->out_ids); - - res->add_input(inp); - - return inp->out_ids; -} - -ggml_tensor * llama_context_base::build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(cparams); - - inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp->mean); - - res->add_input(inp); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + ggml_set_input(inp->cur); - return inp->mean; -} - -ggml_tensor * llama_context_base::build_inp_cls( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(cparams); - - inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->cls); - - res->add_input(inp); - - return inp->cls; + return inp; } llama_graph_input_attn_ptr llama_context_base::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -1841,8 +1519,6 @@ llama_graph_input_attn_ptr llama_context_base::build_attn_inp( inp->kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; - res->add_input(inp); - return inp; } @@ -1874,6 +1550,55 @@ ggml_tensor * llama_context_base::build_attn( return cur; } +ggml_tensor * llama_context_base::build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) const { + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & n_rot = model.hparams.n_rot; + const auto & rope_type = model.hparams.rope_type; + + struct ggml_tensor * tmp; + + if (ggml_is_quantized(cur->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); + + if (bbuf) { + for (const auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + } + + tmp = ggml_rope_ext_inplace(ctx0, tmp, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + + tmp = ggml_cpy(ctx0, tmp, cur); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, cur, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + + return tmp; +} + ggml_tensor * llama_context_base::build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, @@ -2485,6 +2210,7 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i // llama_context_kv_self // +// I32 [n_kv, n_batch] class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { public: llama_graph_input_pos_bucket_kv( @@ -2494,20 +2220,18 @@ class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - const llama_hparams & hparams; const llama_kv_cache_unified * kv_self; }; void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { - if (pos_bucket) { + if (cur) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) cur->data; const int64_t n_kv = kv_self->n; @@ -3311,24 +3035,20 @@ ggml_cgraph * llama_context_kv_self::graph_init() { return llama_context_base::graph_init(); } -ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( - llama_graph_result * res, +llama_graph_input_ptr llama_context_kv_self::build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const { auto inp = std::make_shared(model.hparams, kv_self.get()); const auto n_kv = kv_self->n; - inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - ggml_set_input(inp->pos_bucket); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + ggml_set_input(inp->cur); - res->inputs.push_back(inp); 
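A minimal sketch of the builder contract used above, assuming lgf points at the llama_graph_i implementation and res at the llama_graph_result being populated (names follow this patch; the snippet is illustrative only): the context-side builder just allocates the input object, and the caller registers it and reads the tensor from inp->cur.

    // illustrative only: consuming a context-provided graph input
    llama_graph_input_ptr inp = lgf->build_inp_pos_bucket(ctx0, n_tokens);
    res->add_input(inp);                  // the graph result keeps the shared_ptr alive
    ggml_tensor * pos_bucket = inp->cur;  // I32 [n_kv, n_batch], filled later by set_input()
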
- - return inp->pos_bucket; + return inp; } llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -3359,8 +3079,6 @@ llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; } - res->add_input(inp); - return inp; } @@ -3833,6 +3551,7 @@ size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_se // llama_context_recurrent // +// I32 [kv_size] class llama_graph_input_s_copy : public llama_graph_input_i { public: llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} @@ -3840,8 +3559,6 @@ class llama_graph_input_s_copy : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_copy; // I32 [kv_size] - llama_kv_cache_recurrent * kv_self; }; @@ -3850,9 +3567,9 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = kv_self->n; - if (s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); - int32_t * data = (int32_t *) s_copy->data; + if (cur) { + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); + int32_t * data = (int32_t *) cur->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { @@ -3878,6 +3595,7 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { } } +// F32 [1, n_kv] class llama_graph_input_s_mask : public llama_graph_input_i { public: llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} @@ -3885,8 +3603,6 @@ class llama_graph_input_s_mask : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_mask; // F32 [1, n_kv] - llama_kv_cache_recurrent * kv_self; }; @@ -3895,9 +3611,9 @@ void llama_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = kv_self->n; - if (s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); - float * data = (float *) s_mask->data; + if (cur) { + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); + float * data = (float *) cur->data; // clear unused states for (int i = 0; i < n_kv; ++i) { @@ -4302,36 +4018,30 @@ ggml_cgraph * llama_context_recurrent::graph_init() { return llama_context_base::graph_init(); } -ggml_tensor * llama_context_recurrent::build_inp_s_copy( - llama_graph_result * res, +llama_graph_input_ptr llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0) const { auto inp = std::make_shared(kv_self.get()); const auto n_kv = kv_self->n; - inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - //cb(inp.s_copy, "inp_s_copy", -1); - ggml_set_input(inp->s_copy); - - res->add_input(inp); + inp->cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp.cur, "inp_s_copy", -1); + ggml_set_input(inp->cur); - return inp->s_copy; + return inp; } -ggml_tensor * llama_context_recurrent::build_inp_s_mask( - llama_graph_result * res, +llama_graph_input_ptr llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0) const { auto inp = std::make_shared(kv_self.get()); const auto n_kv = kv_self->n; - inp->s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - //cb(inp->s_mask, "inp_s_mask", -1); - ggml_set_input(inp->s_mask); - - res->add_input(inp); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp->cur, 
"inp_s_mask", -1); + ggml_set_input(inp->cur); - return inp->s_mask; + return inp; } ggml_tensor * llama_context_recurrent::build_copy_mask_state( @@ -4904,6 +4614,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // llama_context_dec // +// F32 [n_embd, n_outputs_enc] class llama_graph_input_cross_embd : public llama_graph_input_i { public: llama_graph_input_cross_embd( @@ -4912,26 +4623,24 @@ class llama_graph_input_cross_embd : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - const llama_cross * cross; }; void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) { GGML_UNUSED(ubatch); - if (cross_embd && cross->t_embd) { - assert(cross_embd->type == GGML_TYPE_F32); + if (cur && cross->t_embd) { + assert(cur->type == GGML_TYPE_F32); - ggml_backend_tensor_set(cross_embd, cross->v_embd, 0, ggml_nbytes(cross_embd)); + ggml_backend_tensor_set(cur, cross->v_embd, 0, ggml_nbytes(cur)); } } class llama_graph_input_attn_dec : public llama_graph_input_attn_i { public: llama_graph_input_attn_dec( - llama_graph_input_attn_i * inp_kv_self, - const llama_cross * cross) : inp_kv_self(inp_kv_self), cross(cross) {} + llama_graph_input_attn_ptr inp_kv_self, + const llama_cross * cross) : inp_kv_self(std::move(inp_kv_self)), cross(cross) {} void set_input(const llama_ubatch * ubatch) override; @@ -4942,11 +4651,14 @@ class llama_graph_input_attn_dec : public llama_graph_input_attn_i { ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch] ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch] - llama_graph_input_attn_i * inp_kv_self = nullptr; + llama_graph_input_attn_ptr inp_kv_self = nullptr; + const llama_cross * cross = nullptr; }; void llama_graph_input_attn_dec::set_input(const llama_ubatch * ubatch) { + inp_kv_self->set_input(ubatch); + if (cross_kq_mask) { const int64_t n_enc = cross_kq_mask->ne[0]; const int64_t n_tokens = ubatch->n_tokens; @@ -4990,17 +4702,16 @@ ggml_cgraph * llama_context_dec::graph_init() { return llama_context_kv_self::graph_init(); } -ggml_tensor * llama_context_dec::build_inp_cross_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_context_dec::build_inp_cross_embd( ggml_context * ctx0) const { auto inp = std::make_shared(cross); // if we have the output embeddings from the encoder, use them directly // TODO: needs more work to be correct, for now just use the tensor shape //if (cross->t_embd) { - // inp->cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + // inp->cur = ggml_view_tensor(ctx0, cross->t_embd); - // return inp->cross_embd; + // return inp->cur; //} const auto & hparams = model.hparams; @@ -5008,23 +4719,20 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd( const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; const auto n_enc = cross->t_embd ? 
cross->t_embd->ne[1] : hparams.n_ctx_train; - inp->cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); - ggml_set_input(inp->cross_embd); - - res->add_input(inp); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); + ggml_set_input(inp->cur); - return inp->cross_embd; + return inp; } llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa) const { - auto inp_kv_self = llama_context_kv_self::build_attn_inp(res, ctx0, n_tokens, causal, swa); + auto inp_kv_self = llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); - auto inp = std::make_shared(inp_kv_self.get(), cross); + auto inp = std::make_shared(std::move(inp_kv_self), cross); const int32_t n_enc = cross->t_embd ? cross->t_embd->ne[1] : model.hparams.n_ctx_train; @@ -5033,8 +4741,6 @@ llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; - res->add_input(inp); - return inp; } diff --git a/src/llama-context.h b/src/llama-context.h index 21015e8796e40..a5159bc5b34b6 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -242,12 +242,6 @@ class llama_context_base : public llama_context, public llama_graph_i { int decode(llama_batch & inp_batch) override; protected: - // - // input - // - - virtual int64_t n_pos_per_token() const; // vision - // // output // @@ -287,6 +281,8 @@ class llama_context_base : public llama_context, public llama_graph_i { // graph build // + int32_t get_n_outputs() const override; + void build_cb( ggml_tensor * cur, const char * name, @@ -314,45 +310,16 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_tensor * build_rope_factors(int il) const override; - ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const override; - - ggml_tensor * build_inp_embd( - llama_graph_result * res, + llama_graph_input_ptr build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const override; - ggml_tensor * build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const override; - - ggml_tensor * build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_cls( - llama_graph_result * res, + llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -370,7 +337,15 @@ class llama_context_base : public llama_context, public llama_graph_i { int il) const override; protected: - virtual ggml_tensor * build_attn_mha( + // note: optionally set the backend to be the same as the bbuf's backend + ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) const; + + ggml_tensor * build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q, @@ -458,28 +433,9 @@ class llama_context_base : public llama_context, public 
llama_graph_i { llama_loras loras; llama_sbatch sbatch; - ggml_threadpool_t threadpool = nullptr; - ggml_threadpool_t threadpool_batch = nullptr; - - ggml_abort_callback abort_callback = nullptr; - void * abort_callback_data = nullptr; - - ggml_backend_t backend_cpu = nullptr; - std::vector backends; - - std::vector> set_n_threads_fns; - ggml_backend_sched_ptr sched; - // buffer types used for the compute buffer of each backend - std::vector backend_ptrs; - std::vector backend_buft; - - // memory buffers used to evaluate the model - std::vector buf_compute_meta; - - // host buffer for the model output (logits and embeddings) - ggml_backend_buffer_ptr buf_output; + // TODO: these below likely need some rework in the future, together with the batch-refactoring // TODO: remove bool logits_all = false; @@ -502,6 +458,30 @@ class llama_context_base : public llama_context, public llama_graph_i { std::vector output_ids; // map batch token positions to ids of the logits and embd buffers +private: + // base functionality - should not leak into derived classes + + ggml_threadpool_t threadpool = nullptr; + ggml_threadpool_t threadpool_batch = nullptr; + + ggml_abort_callback abort_callback = nullptr; + void * abort_callback_data = nullptr; + + ggml_backend_t backend_cpu = nullptr; + std::vector backends; + + std::vector> set_n_threads_fns; + + // buffer types used for the compute buffer of each backend + std::vector backend_ptrs; + std::vector backend_buft; + + // memory buffers used to evaluate the model + std::vector buf_compute_meta; + + // host buffer for the model output (logits and embeddings) + ggml_backend_buffer_ptr buf_output; + bool has_evaluated_once = false; }; @@ -539,13 +519,11 @@ class llama_context_kv_self : public llama_context_base { // graph build // - ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, + llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -624,12 +602,10 @@ class llama_context_recurrent : public llama_context_base { // graph build // - ggml_tensor * build_inp_s_copy( - llama_graph_result * res, + llama_graph_input_ptr build_inp_s_copy( ggml_context * ctx0) const override; - ggml_tensor * build_inp_s_mask( - llama_graph_result * res, + llama_graph_input_ptr build_inp_s_mask( ggml_context * ctx0) const override; ggml_tensor * build_copy_mask_state( @@ -694,6 +670,10 @@ class llama_context_recurrent : public llama_context_base { std::unique_ptr kv_self; }; +// +// enc-dec +// + // TODO: tmp - need something better to pass the data from the encoder to the decoder struct llama_cross { // the output embeddings from the encoder as a ggml tensor @@ -714,7 +694,7 @@ class llama_context_enc : public llama_context_base { int encode(llama_batch & inp_batch) override; - llama_cross * cross = nullptr; + llama_cross * cross = nullptr; // TODO: hacky, rework }; class llama_context_dec : public llama_context_kv_self { @@ -730,12 +710,10 @@ class llama_context_dec : public llama_context_kv_self { ggml_cgraph * graph_init() override; - ggml_tensor * build_inp_cross_embd( - llama_graph_result * res, + llama_graph_input_ptr build_inp_cross_embd( ggml_context * ctx0) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -753,7 +731,7 @@ class llama_context_dec : public llama_context_kv_self { int il) const override; 
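The set_input() pattern shared by the inputs in this patch can be summarized with a small hypothetical input class (not one added by the patch); it assumes the base class exposes the default cur tensor and that set_input() runs after the backend buffers are allocated, as in the implementations above.

    // hypothetical example, for illustration only: an I32 [n_tokens] input filled from the ubatch
    class llama_graph_input_example : public llama_graph_input_i {
    public:
        void set_input(const llama_ubatch * ubatch) override {
            if (cur) {
                GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer));

                int32_t * data = (int32_t *) cur->data;

                for (int64_t i = 0; i < ubatch->n_tokens; ++i) {
                    data[i] = (int32_t) i; // identity mapping, purely illustrative
                }
            }
        }
    };
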
public: - llama_cross * cross = nullptr; + llama_cross * cross = nullptr; // TODO: hacky, rework }; class llama_context_enc_dec : public llama_context { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 79b26d1734ca3..89e311a915a31 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -67,20 +67,16 @@ ggml_tensor * llama_graph_i::build_attn_cross( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_cross_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_graph_i::build_inp_cross_embd( ggml_context * ctx0) const { - GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } -ggml_tensor * llama_graph_i::build_inp_s_copy ( - llama_graph_result * res, +llama_graph_input_ptr llama_graph_i::build_inp_s_copy ( ggml_context * ctx0) const { - GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -88,10 +84,8 @@ ggml_tensor * llama_graph_i::build_inp_s_copy ( return nullptr; // NOLINT } -ggml_tensor * llama_graph_i::build_inp_s_mask( - llama_graph_result * res, +llama_graph_input_ptr llama_graph_i::build_inp_s_mask( ggml_context * ctx0) const { - GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); diff --git a/src/llama-graph.h b/src/llama-graph.h index 7ae99becc7e23..343d4a0772277 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -29,6 +29,9 @@ class llama_graph_input_i { virtual ~llama_graph_input_i() = default; virtual void set_input(const llama_ubatch * ubatch) = 0; + + // by default, we produce a single input tensor, but some children could produce more + ggml_tensor * cur = nullptr; }; using llama_graph_input_ptr = std::shared_ptr; @@ -76,7 +79,7 @@ class llama_graph_result : public llama_graph_result_i { } } - void add_input(llama_graph_input_ptr && input) { + void add_input(llama_graph_input_ptr input) { inputs.emplace_back(std::move(input)); } @@ -92,19 +95,23 @@ class llama_graph_result : public llama_graph_result_i { // llama_graph // +// note: keep all methods const // TODO: can become more granular in the future -// TODO: move all methods that do not require things from llama_context to llm_build_context class llama_graph_i { public: llama_graph_i(llama_graph_type type); virtual ~llama_graph_i() = default; - llama_graph_type get_type() const { return type; } + llama_graph_type get_type() const { + return type; + } -protected: +private: llama_graph_type type; public: + virtual int32_t get_n_outputs() const = 0; + // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
virtual void build_cb( ggml_tensor * cur, @@ -131,50 +138,27 @@ class llama_graph_i { ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * ids) const = 0; + // rope factors based on the current context size virtual ggml_tensor * build_rope_factors(int il) const = 0; - // note: optionally set the backend to be the same as the bbuf's backend - virtual ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const = 0; - // graph build API (context-specific) - virtual ggml_tensor * build_inp_embd( - llama_graph_result * res, + // input embeddings with optional lora + virtual llama_graph_input_ptr build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const = 0; - virtual ggml_tensor * build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; - - virtual ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; - - virtual ggml_tensor * build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const = 0; - - virtual ggml_tensor * build_inp_mean( - llama_graph_result * res, + // enc-dec pos + virtual llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const = 0; - virtual ggml_tensor * build_inp_cls( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; + // + // attention API + // virtual llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -202,16 +186,17 @@ class llama_graph_i { float kq_scale, int il) const; - virtual ggml_tensor * build_inp_cross_embd( - llama_graph_result * res, + virtual llama_graph_input_ptr build_inp_cross_embd( ggml_context * ctx0) const; - virtual ggml_tensor * build_inp_s_copy( - llama_graph_result * res, + // + // recurrent API + // + + virtual llama_graph_input_ptr build_inp_s_copy( ggml_context * ctx0) const; - virtual ggml_tensor * build_inp_s_mask( - llama_graph_result * res, + virtual llama_graph_input_ptr build_inp_s_mask( ggml_context * ctx0) const; virtual ggml_tensor * build_copy_mask_state( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7fae82c6ecc49..60a8cc0f8b0a7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3813,6 +3813,212 @@ enum llm_norm_type { LLM_NORM_GROUP, }; +class llama_graph_input_pos : public llama_graph_input_i { +public: + llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + virtual ~llama_graph_input_pos() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos = nullptr; // I32 [n_batch] + + const int64_t n_pos_per_token = 1; +}; + +void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { + if (ubatch->pos && pos) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + } +} + +class llama_graph_input_out_ids : public llama_graph_input_i { +public: + llama_graph_input_out_ids( + const llama_hparams & hparams, + const llama_cparams & cparams, + int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + virtual ~llama_graph_input_out_ids() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * out_ids; // I32 [n_outputs] + + const llama_hparams & hparams; + const llama_cparams & cparams; + + const int32_t 
n_outputs; +}; + +void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); + + if (!out_ids) { + LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); + int32_t * data = (int32_t *) out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch->output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch->output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); + } + } + } +} + +class llama_graph_input_mean : public llama_graph_input_i { +public: + llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} + virtual ~llama_graph_input_mean() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * mean; // F32 [n_batch, n_batch] + + const llama_cparams & cparams; +}; + +void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) { + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(mean); + GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); + + float * data = (float *) mean->data; + memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + + sum[seq_id] += ubatch->n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } + } +} + +class llama_graph_input_cls : public llama_graph_input_i { +public: + llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} + virtual ~llama_graph_input_cls() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * cls; // I32 [n_batch] + + const llama_cparams & cparams; +}; + +void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) { + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + 
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } +} + struct llm_build_context { const llama_model & model; const llama_hparams & hparams; @@ -3895,55 +4101,75 @@ struct llm_build_context { res (std::make_unique()) { } + int64_t n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; + } + // TODO: tmp void cb(struct ggml_tensor * cur, const char * name, int il) { lgf->build_cb(cur, name, ubatch, il); } - // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf->build_inp_embd(res.get(), ctx0, tok_embd, ubatch); - cb(inpL, "inp_embd", -1); + auto inp = lgf->build_inp_embd(ctx0, tok_embd, ubatch); + + cb(inp->cur, "inp_embd", -1); - return inpL; + res->add_input(inp); + + return inp->cur; } - // TODO: tmp - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens); - cb(cur, "inp_pos", -1); + struct ggml_tensor * build_inp_pos() const { + auto inp = std::make_shared(n_pos_per_token()); - return cur; + inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp->pos); + + res->add_input(inp); + + return inp->pos; } - // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0); - cb(cur, "inp_out_ids", -1); + const auto n_outputs = lgf->get_n_outputs(); - return cur; + auto inp = std::make_shared(hparams, cparams, n_outputs); + + inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + ggml_set_input(inp->out_ids); + + res->add_input(inp); + + return inp->out_ids; } - // TODO: tmp struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens); - cb(cur, "inp_mean", -1); + auto inp = std::make_shared(cparams); - return cur; + inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp->mean); + + res->add_input(inp); + + return inp->mean; } - // TODO: tmp struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens); - cb(cur, "inp_cls", -1); + auto inp = std::make_shared(cparams); - return cur; + inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + 
ggml_set_input(inp->cls); + + res->add_input(inp); + + return inp->cls; } // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, - struct ggml_tensor * cur) { + struct ggml_tensor * cur) const { return lgf->build_lora_mm(ctx0, w, cur); } @@ -3951,24 +4177,42 @@ struct llm_build_context { struct ggml_tensor * build_lora_mm_id( struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { + struct ggml_tensor * ids) const { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } - // TODO: tmp struct ggml_tensor * build_pos_bucket() { - ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens); - cb(cur, "pos_bucket", -1); + auto inp = lgf->build_inp_pos_bucket(ctx0, n_tokens); + cb(inp->cur, "pos_bucket", -1); - return cur; + res->add_input(inp); + + return inp->cur; } - // TODO: tmp struct ggml_tensor * build_inp_cross_embd() { - ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0); - cb(cur, "embd_enc", -1); + auto inp = lgf->build_inp_cross_embd(ctx0); + cb(inp->cur, "embd_enc", -1); - return cur; + res->add_input(inp); + + return inp->cur; + } + + struct ggml_tensor * build_inp_s_copy() const { + auto inp = lgf->build_inp_s_copy(ctx0); + + res->add_input(inp); + + return inp->cur; + } + + struct ggml_tensor * build_inp_s_mask() const { + auto inp = lgf->build_inp_s_mask(ctx0); + + res->add_input(inp); + + return inp->cur; } struct ggml_tensor * build_norm( @@ -4250,6 +4494,18 @@ struct llm_build_context { return moe_out; } + llama_graph_input_attn_ptr build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = lgf->build_attn_inp(ctx0, n_tokens, causal, swa); + + res->add_input(inp); + + return inp; + } + struct ggml_tensor * build_attn( llama_graph_input_attn_i * inp, ggml_cgraph * gf, @@ -4490,7 +4746,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4651,7 +4907,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4807,7 +5063,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4923,7 +5179,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5028,7 +5284,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5151,7 +5407,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5303,7 +5559,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5425,7 +5681,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5526,7 +5782,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5640,7 +5896,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5785,7 +6041,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); inpL = build_norm(inpL, model.tok_norm, @@ -5888,7 +6144,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); if (model.pos_embd) { // inp_pos - contains the positions @@ -6030,11 +6286,9 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { - - // norm cur = 
build_norm(inpL, model.layers[il].attn_norm, @@ -6181,7 +6435,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6295,7 +6549,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6408,7 +6662,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6526,7 +6780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6673,7 +6927,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6795,8 +7049,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6940,7 +7193,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -7046,7 +7299,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7152,7 +7405,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7263,7 +7516,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { 
struct ggml_tensor * inpSA = inpL; @@ -7382,7 +7635,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7510,7 +7763,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7711,7 +7964,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { // norm @@ -7819,7 +8072,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { // norm @@ -7949,7 +8202,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8062,8 +8315,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); for (int il = 0; il < n_layer; ++il) { // norm @@ -8124,7 +8377,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -8272,7 +8525,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8407,7 +8660,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8527,7 +8780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8651,7 +8904,7 @@ struct llm_build_context { 
// inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8772,7 +9025,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8900,7 +9153,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -9044,7 +9297,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9174,7 +9427,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9337,7 +9590,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9555,7 +9808,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9706,7 +9959,7 @@ struct llm_build_context { struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9809,7 +10062,7 @@ struct llm_build_context { const int64_t n_outputs_enc = embd_enc->ne[1]; - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9972,7 +10225,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10066,7 +10319,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), 
ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10196,7 +10449,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10317,7 +10570,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10435,8 +10688,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10527,8 +10780,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10622,7 +10875,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; diff --git a/src/llama-model.h b/src/llama-model.h index 2d64c0d242c09..45abce7d53d8a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -365,7 +365,6 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; - // TODO: add encode/decode graphs llama_graph_result_ptr build_graph( ggml_context * ctx, ggml_cgraph * gf, From 624f7bd03bdea9e8d5c6d2ca02d87268394cc20c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 21:13:08 +0200 Subject: [PATCH 84/84] graph : add comments ggml-ci --- src/llama-context.cpp | 1 + src/llama-graph.cpp | 16 +++++++------- src/llama-graph.h | 51 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7ba86a2a7f91a..8963b85ca8151 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -101,6 +101,7 @@ void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { } } +// note: this does not depend on the context and can technically be moved to llama-model.cpp class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 89e311a915a31..119f1a56f3841 100644 --- a/src/llama-graph.cpp +++ 
b/src/llama-graph.cpp
@@ -19,6 +19,14 @@ ggml_tensor * llama_graph_input_attn_i::get_kq_mask_cross() {
 
 llama_graph_i::llama_graph_i(llama_graph_type type) : type(type) {}
 
+llama_graph_input_ptr llama_graph_i::build_inp_cross_embd(
+        ggml_context * ctx0) const {
+    GGML_UNUSED(ctx0);
+
+    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
+    return nullptr;
+}
+
 ggml_tensor * llama_graph_i::build_attn(
         llama_graph_input_attn_i * inp,
         ggml_context * ctx0,
@@ -67,14 +75,6 @@ ggml_tensor * llama_graph_i::build_attn_cross(
     return nullptr;
 }
 
-llama_graph_input_ptr llama_graph_i::build_inp_cross_embd(
-        ggml_context * ctx0) const {
-    GGML_UNUSED(ctx0);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-    return nullptr;
-}
-
 llama_graph_input_ptr llama_graph_i::build_inp_s_copy (
         ggml_context * ctx0) const {
     GGML_UNUSED(ctx0);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 343d4a0772277..2d62c674f2679 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -10,32 +10,49 @@
 struct ggml_cgraph;
 struct ggml_context;
 struct ggml_tensor;
-struct ggml_backend_buffer;
 
 struct llama_ubatch;
 
+// certain models (typically multi-modal) can produce different types of graphs
+// the llama_context specifies which type of graph it needs through the llama_graph_i::type member
 enum llama_graph_type {
     LLAMA_GRAPH_TYPE_DEFAULT,
     LLAMA_GRAPH_TYPE_ENCODER,
     LLAMA_GRAPH_TYPE_DECODER,
 };
 
+
 //
 // llama_graph_input
 //
 
+// denotes an input to the graph
+// typically, the data of these objects is populated based on the contents of the current llama_ubatch:
+//
+//   - llama_graph_input_pos
+//   - llama_graph_input_out_ids
+//   - etc.
+//
+// some inputs require context-specific data (e.g. KV cache) - such inputs are defined for the specific llama_context:
+//
+//   - llama_graph_input_embd         (can apply lora)
+//   - llama_graph_input_attn_kv_self (requires KV cache instance)
+//   - etc.
+//
+
 class llama_graph_input_i {
 public:
     virtual ~llama_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
 
-    // by default, we produce a single input tensor, but some children could produce more
+    // by default, we produce a single input tensor, but some implementations could produce more
    ggml_tensor * cur = nullptr;
 };
 
 using llama_graph_input_ptr = std::shared_ptr<llama_graph_input_i>;
 
+
 class llama_graph_input_attn_i : public llama_graph_input_i {
 public:
     virtual ~llama_graph_input_attn_i() = default;
@@ -47,10 +64,17 @@ class llama_graph_input_attn_i : public llama_graph_input_i {
 
 using llama_graph_input_attn_ptr = std::shared_ptr<llama_graph_input_attn_i>;
 
+
 //
 // llama_graph_result
 //
 
+// these objects deliver the result from the graph build process back to the llama_context
+// note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
+// specific data, by calling the set_inputs() method
+// along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
+// these are used by the llama_context to extract the relevant data, based on the compute parameters
+
 class llama_graph_result_i {
 public:
     virtual ~llama_graph_result_i() = default;
@@ -64,9 +88,9 @@ class llama_graph_result_i {
 
 using llama_graph_result_ptr = std::unique_ptr<llama_graph_result_i>;
 
+
 class llama_graph_result : public llama_graph_result_i {
 public:
-    llama_graph_result() = default;
     virtual ~llama_graph_result() = default;
 
     ggml_tensor * get_logits() override { return t_logits; }
@@ -91,10 +115,19 @@ class llama_graph_result : public llama_graph_result_i {
     std::vector<llama_graph_input_ptr> inputs;
 };
 
+
 //
 // llama_graph
 //
 
+// this interface defines an API for building graphs by abstracting some high-level concepts such as attention, lora, etc.
+// functionality that is trivial and does not rely on the llama_context should be directly implemented in llm_build_context
+// other context-specific functionality should be declared here and implemented in the llama_context variations
+
+// the main goal of this interface is to separate the llama_context specifics from the graph building logic
+// this allows for cleaner model architecture definitions while being able to overload certain complex
+// functionality in order to fit different use cases and/or explore new implementations and ideas
+
 // note: keep all methods const
 // TODO: can become more granular in the future
 class llama_graph_i {
@@ -112,6 +145,10 @@ class llama_graph_i {
 public:
     virtual int32_t get_n_outputs() const = 0;
 
+    //
+    // context-specific API
+    //
+
     // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     virtual void build_cb(
             ggml_tensor * cur,
@@ -141,8 +178,6 @@ class llama_graph_i {
     // rope factors based on the current context size
     virtual ggml_tensor * build_rope_factors(int il) const = 0;
 
-    // graph build API (context-specific)
-
     // input embeddings with optional lora
     virtual llama_graph_input_ptr build_inp_embd(
             ggml_context * ctx0,
@@ -154,6 +189,9 @@ class llama_graph_i {
             ggml_context * ctx0,
             int32_t n_tokens) const = 0;
 
+    virtual llama_graph_input_ptr build_inp_cross_embd(
+            ggml_context * ctx0) const;
+
     //
     // attention API
     //
@@ -186,9 +224,6 @@ class llama_graph_i {
             float kq_scale,
             int il) const;
 
-    virtual llama_graph_input_ptr build_inp_cross_embd(
-            ggml_context * ctx0) const;
-
     //
     // recurrent API
     //
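
To illustrate the input/result pattern described in the comments above, here is a minimal, self-contained C++ sketch. The stand-in types and simplified signatures are assumptions for illustration only, not the actual llama.cpp declarations: an input object implements set_input() and exposes its tensor through cur, the result object collects inputs via add_input(), a builder-side wrapper (analogous to the build_inp_pos / build_attn_inp wrappers added to llm_build_context in this series) registers each input before returning its tensor, and the context later calls set_inputs() to populate everything from the current ubatch.

#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

// stand-ins for the real ggml / llama types (illustrative only)
struct ggml_tensor_stub  { int n_tokens = 0; };
struct llama_ubatch_stub { int n_tokens = 0; };

// mirrors the role of llama_graph_input_i: one object per graph input
class graph_input_i {
public:
    virtual ~graph_input_i() = default;
    virtual void set_input(const llama_ubatch_stub * ubatch) = 0;

    ggml_tensor_stub * cur = nullptr; // tensor produced for this input
};
using graph_input_ptr = std::shared_ptr<graph_input_i>;

// hypothetical ubatch-driven input, in the spirit of llama_graph_input_pos
class graph_input_pos : public graph_input_i {
public:
    void set_input(const llama_ubatch_stub * ubatch) override {
        // the real implementation would copy the ubatch positions into cur's buffer
        std::printf("populating positions for %d tokens\n", ubatch->n_tokens);
    }
};

// mirrors the role of llama_graph_result: keeps references to the inputs it was given
class graph_result {
public:
    void add_input(graph_input_ptr inp) { inputs.push_back(std::move(inp)); }

    void set_inputs(const llama_ubatch_stub * ubatch) {
        for (auto & inp : inputs) {
            inp->set_input(ubatch);
        }
    }

private:
    std::vector<graph_input_ptr> inputs;
};

// builder-side wrapper, analogous to llm_build_context::build_inp_pos:
// create the input, register it in the result, return its tensor for graph construction
static ggml_tensor_stub * build_inp_pos(graph_result & res, ggml_tensor_stub * storage) {
    auto inp = std::make_shared<graph_input_pos>();
    inp->cur = storage;
    res.add_input(inp);
    return inp->cur;
}

int main() {
    graph_result res;

    ggml_tensor_stub pos_storage;
    ggml_tensor_stub * pos = build_inp_pos(res, &pos_storage); // used while building the graph
    (void) pos;

    llama_ubatch_stub ubatch;
    ubatch.n_tokens = 32;
    res.set_inputs(&ubatch); // the context populates all registered inputs before running the graph
    return 0;
}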
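
The llama_graph_i comments also rely on an override pattern that the llama-graph.cpp hunk above shows directly: the base interface provides stub implementations for optional, context-specific builders that only log "not implemented" and return nullptr (see llama_graph_i::build_inp_cross_embd), and each llama_context variation overrides just the builders it supports. A compressed sketch of that shape, using invented names and simplified return types rather than the real signatures:

#include <cstdio>

struct ggml_context_stub;   // opaque stand-in for ggml_context
struct graph_input_stub {}; // stand-in for a graph input object

// base interface: optional context-specific builders default to "not implemented"
class graph_iface {
public:
    virtual ~graph_iface() = default;

    virtual graph_input_stub * build_inp_cross_embd(ggml_context_stub * ctx0) const {
        (void) ctx0;
        std::fprintf(stderr, "%s: not implemented\n", __func__);
        return nullptr;
    }
};

// a context variation that actually supports cross-attention embeddings
class graph_enc_dec : public graph_iface {
public:
    graph_input_stub * build_inp_cross_embd(ggml_context_stub * ctx0) const override {
        (void) ctx0;
        static graph_input_stub inp; // placeholder: the real code would build and return a new input
        return &inp;
    }
};

int main() {
    graph_enc_dec g;
    graph_iface & base = g;
    return base.build_inp_cross_embd(nullptr) != nullptr ? 0 : 1;
}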