Commit 37cec43

kv-cache : extract the graph-specific state from the object (unified)
ggml-ci
1 parent: a592c13

6 files changed: +153 −71 lines
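
In short: graph construction no longer queries the KV cache object directly for its per-batch layout; it now receives a `llama_memory_state_i` pointer and threads the derived compute state into every cache accessor. A condensed before/after sketch (illustrative only, using the accessor names visible in the diffs below):

    // before: the cache object itself carries the per-graph layout
    const auto n_kv = kv_self->get_n_kv();
    ggml_tensor * k  = kv_self->get_k(ctx0, il);

    // after: the layout comes from the graph-specific compute state
    const auto n_kv = kv_self->get_n_kv(cstate);
    ggml_tensor * k  = kv_self->get_k(cstate, ctx0, il);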

src/llama-context.cpp

Lines changed: 9 additions & 7 deletions
@@ -651,7 +651,7 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
         return nullptr;
     }
 
-    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype);
+    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mstate);
     if (!res) {
         LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
         if (ret) {
@@ -1289,7 +1289,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
 
     auto * gf = graph_init();
-    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
+    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, nullptr);
 
     this->n_outputs = save_n_outputs;
 
@@ -1310,10 +1310,11 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
 }
 
 llm_graph_result_ptr llama_context::graph_build(
-              ggml_context * ctx,
-               ggml_cgraph * gf,
-        const llama_ubatch & ubatch,
-            llm_graph_type   gtype) {
+                  ggml_context * ctx,
+                   ggml_cgraph * gf,
+            const llama_ubatch & ubatch,
+                llm_graph_type   gtype,
+    const llama_memory_state_i * mstate) {
     return model.build_graph(
         {
             /*.ctx =*/ ctx,
@@ -1326,6 +1327,7 @@ llm_graph_result_ptr llama_context::graph_build(
             /*.cvec =*/ &cvec,
             /*.loras =*/ &loras,
             /*.memory =*/ memory.get(),
+            /*.mstate =*/ mstate,
             /*.cross =*/ &cross,
             /*.n_outputs =*/ n_outputs,
             /*.cb =*/ graph_get_cb(),
@@ -2047,7 +2049,7 @@ void llama_context::opt_epoch_iter(
             n_outputs = ubatch.n_tokens;
 
             auto * gf = graph_init();
-            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
+            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, kv_state.get());
 
             struct ggml_context * ctx_compute_opt;
             {
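
For reference, the three call sites of graph_build() after this change, condensed from the hunks above (not verbatim; each line stands in for its own call site):

    // decode path (process_ubatch): forward the memory state prepared for this ubatch
    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mstate);

    // graph_reserve(): no memory state exists yet, so the builders must tolerate a null mstate
    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, nullptr);

    // opt_epoch_iter(): the training loop owns its own KV state and passes it explicitly
    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, kv_state.get());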

src/llama-context.h

Lines changed: 5 additions & 4 deletions
@@ -200,10 +200,11 @@ struct llama_context {
 
 private:
     llm_graph_result_ptr graph_build(
-              ggml_context * ctx,
-               ggml_cgraph * gf,
-        const llama_ubatch & ubatch,
-            llm_graph_type   gtype);
+                  ggml_context * ctx,
+                   ggml_cgraph * gf,
+            const llama_ubatch & ubatch,
+                llm_graph_type   gtype,
+    const llama_memory_state_i * mstate);
 
     llm_graph_cb graph_get_cb() const;
 

src/llama-graph.cpp

Lines changed: 37 additions & 12 deletions
@@ -449,6 +449,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     cvec    (params.cvec),
     loras   (params.loras),
     memory  (params.memory),
+    mstate  (params.mstate),
     cross   (params.cross),
     cb_func (params.cb),
     res     (std::make_unique<llm_graph_result>()) {
@@ -1027,9 +1028,13 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
 ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
     const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
 
+    const llama_kv_cache_unified_state_i * kv_state = static_cast<const llama_kv_cache_unified_state_i *>(mstate);
+
+    const llama_kv_cache_unified::compute_state * cstate = kv_state ? kv_state->get_cstate() : nullptr;
+
     auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
 
-    const auto n_kv = kv_self->get_n_kv();
+    const auto n_kv = kv_self->get_n_kv(cstate);
 
     auto & cur = inp->pos_bucket;
 
@@ -1233,12 +1238,16 @@ ggml_tensor * llm_graph_context::build_attn(
 llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
     const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
 
+    const llama_kv_cache_unified_state_i * kv_state = static_cast<const llama_kv_cache_unified_state_i *>(mstate);
+
+    const llama_kv_cache_unified::compute_state * cstate = kv_state ? kv_state->get_cstate() : nullptr;
+
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
 
     {
         GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");
 
-        const auto n_kv = kv_self->get_n_kv();
+        const auto n_kv = kv_self->get_n_kv(cstate);
 
         inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         //cb(inp->self_kq_mask, "KQ_mask", -1);
@@ -1270,17 +1279,21 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
 
+    const llama_kv_cache_unified_state_i * kv_state = static_cast<const llama_kv_cache_unified_state_i *>(mstate);
+
+    const llama_kv_cache_unified::compute_state * cstate = kv_state ? kv_state->get_cstate() : nullptr;
+
     // store to KV cache
     {
-        ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
-        ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
+        ggml_build_forward_expand(gf, kv_self->cpy_k(cstate, ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv_self->cpy_v(cstate, ctx0, v_cur, il));
     }
 
     const auto & kq_mask = inp->get_kq_mask();
 
     ggml_tensor * q = q_cur;
-    ggml_tensor * k = kv_self->get_k(ctx0, il);
-    ggml_tensor * v = kv_self->get_v(ctx0, il);
+    ggml_tensor * k = kv_self->get_k(cstate, ctx0, il);
+    ggml_tensor * v = kv_self->get_v(cstate, ctx0, il);
 
     ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
@@ -1303,10 +1316,15 @@ ggml_tensor * llm_graph_context::build_attn(
 llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
     const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
 
+    const llama_kv_cache_unified_iswa_state_i * kv_state = static_cast<const llama_kv_cache_unified_iswa_state_i *>(mstate);
+
+    const llama_kv_cache_unified::compute_state * cstate_base = kv_state ? kv_state->get_cstate_base() : nullptr;
+    const llama_kv_cache_unified::compute_state * cstate_swa  = kv_state ? kv_state->get_cstate_swa () : nullptr;
+
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);
 
     {
-        const auto n_kv = kv_self->get_kv_base()->get_n_kv();
+        const auto n_kv = kv_self->get_kv_base()->get_n_kv(cstate_base);
 
         inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         //cb(inp->self_kq_mask, "KQ_mask", -1);
@@ -1318,7 +1336,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
     {
         GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
 
-        const auto n_kv = kv_self->get_kv_swa()->get_n_kv();
+        const auto n_kv = kv_self->get_kv_swa()->get_n_kv(cstate_swa);
 
         inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
@@ -1354,17 +1372,24 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
 
+    const llama_kv_cache_unified_iswa_state_i * kv_state = static_cast<const llama_kv_cache_unified_iswa_state_i *>(mstate);
+
+    const llama_kv_cache_unified::compute_state * cstate_base = kv_state ? kv_state->get_cstate_base() : nullptr;
+    const llama_kv_cache_unified::compute_state * cstate_swa  = kv_state ? kv_state->get_cstate_swa () : nullptr;
+
+    const llama_kv_cache_unified::compute_state * cstate = is_swa ? cstate_swa : cstate_base;
+
     // store to KV cache
     {
-        ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
-        ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
+        ggml_build_forward_expand(gf, kv->cpy_k(cstate, ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv->cpy_v(cstate, ctx0, v_cur, il));
     }
 
     const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
 
     ggml_tensor * q = q_cur;
-    ggml_tensor * k = kv->get_k(ctx0, il);
-    ggml_tensor * v = kv->get_v(ctx0, il);
+    ggml_tensor * k = kv->get_k(cstate, ctx0, il);
+    ggml_tensor * v = kv->get_v(cstate, ctx0, il);
 
     ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
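
The builders above all follow the same pattern: downcast the generic mstate to the cache-specific state interface, guard it (graph_reserve() builds with mstate == nullptr), and pass the resulting compute_state into every KV-cache accessor. A minimal sketch of that pattern, assuming only the types and accessors that appear in this diff:

    // sketch: how a unified-cache builder obtains and uses the per-graph compute state
    const auto * kv_self  = static_cast<const llama_kv_cache_unified *>(memory);
    const auto * kv_state = static_cast<const llama_kv_cache_unified_state_i *>(mstate);

    // mstate may be null (e.g. during graph_reserve), so the compute state is guarded
    const llama_kv_cache_unified::compute_state * cstate = kv_state ? kv_state->get_cstate() : nullptr;

    const auto n_kv = kv_self->get_n_kv(cstate);                             // cache view size for this graph
    ggml_build_forward_expand(gf, kv_self->cpy_k(cstate, ctx0, k_cur, il));  // K write goes through the state
    ggml_tensor * k = kv_self->get_k(cstate, ctx0, il);                      // K read uses the same state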

src/llama-graph.h

Lines changed: 12 additions & 8 deletions
@@ -18,6 +18,8 @@ struct llama_ubatch;
 struct llama_cparams;
 
 class llama_memory_i;
+class llama_memory_state_i;
+
 class llama_kv_cache_unified;
 class llama_kv_cache_unified_iswa;
 class llama_kv_cache_recurrent;
@@ -383,10 +385,11 @@ struct llm_graph_params {
     ggml_backend_sched_t sched;
     ggml_backend_t backend_cpu;
 
-    const llama_adapter_cvec  * cvec;
-    const llama_adapter_loras * loras;
-    const llama_memory_i      * memory;
-    const llama_cross         * cross;
+    const llama_adapter_cvec   * cvec;
+    const llama_adapter_loras  * loras;
+    const llama_memory_i       * memory;
+    const llama_memory_state_i * mstate;
+    const llama_cross          * cross;
 
     int32_t n_outputs;
 
@@ -435,10 +438,11 @@ struct llm_graph_context {
 
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
 
-    const llama_adapter_cvec  * cvec;
-    const llama_adapter_loras * loras;
-    const llama_memory_i      * memory;
-    const llama_cross         * cross;
+    const llama_adapter_cvec   * cvec;
+    const llama_adapter_loras  * loras;
+    const llama_memory_i       * memory;
+    const llama_memory_state_i * mstate;
+    const llama_cross          * cross;
 
     const llm_graph_cb & cb_func;
 
