@@ -3198,11 +3198,21 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce
 // llama_kv_cache_hybrid_recurrent
 //
 
-class llama_kv_cache_hybrid_recurrent_decode_state_t : public llama_memory_decode_state_i {
+class llama_kv_cache_hybrid_recurrent_state : public llama_kv_cache_hybrid_recurrent_state_i {
 public:
-    llama_kv_cache_hybrid_recurrent_decode_state_t(llama_memory_status status) : status(status) {}
-
-    llama_kv_cache_hybrid_recurrent_decode_state_t(
+    // init failure
+    explicit llama_kv_cache_hybrid_recurrent_state(llama_memory_status status)
+        : status(status), state_attn(status), state_recurrent(status) {}
+
+    // init full
+    explicit llama_kv_cache_hybrid_recurrent_state(llama_kv_cache_hybrid_recurrent * kv)
+        : status(LLAMA_MEMORY_STATUS_SUCCESS),
+          kv(kv),
+          state_attn(status, kv->get_kv_attn     ()),
+          state_recurrent(status, kv->get_kv_recurrent()) {}
+
+    // init success
+    llama_kv_cache_hybrid_recurrent_state(
         llama_kv_cache_hybrid_recurrent * kv,
         llama_sbatch sbatch,
         std::vector<uint32_t> heads_attn,
@@ -3211,22 +3221,33 @@ class llama_kv_cache_hybrid_recurrent_decode_state_t : public llama_memory_decod
         kv(kv),
         sbatch(std::move(sbatch)),
         heads_attn(std::move(heads_attn)),
-        ubatches(std::move(ubatches)) {
+        ubatches(std::move(ubatches)),
+        // NOTE: these child states are only used as wrapper APIs for the
+        //  const methods, so we use the "init full" signature since the
+        //  actual state is not used.
+        state_attn(LLAMA_MEMORY_STATUS_SUCCESS, kv->get_kv_attn     ()),
+        state_recurrent(LLAMA_MEMORY_STATUS_SUCCESS, kv->get_kv_recurrent()) {
     }
 
-    ~llama_kv_cache_hybrid_recurrent_decode_state_t() = default;
+    ~llama_kv_cache_hybrid_recurrent_state() = default;
 
-    llama_ubatch * next() override {
+    bool next() override {
         assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
-        if (i_next >= ubatches.size()) {
-            return nullptr;
+        if (++i_next >= ubatches.size()) {
+            return false;
         }
 
-        kv->get_kv_attn()     ->fill_slot(heads_attn[i_next], ubatches[i_next]);
+        return true;
+    }
+
+    bool apply() override {
+        assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+        kv->get_kv_attn()     ->apply_ubatch(heads_attn[i_next], ubatches[i_next]);
         kv->get_kv_recurrent()->find_slot(ubatches[i_next]);
 
-        return &ubatches[i_next++];
+        return true;
     }
 
     std::vector<int64_t> & out_ids() override {
@@ -3239,6 +3260,23 @@ class llama_kv_cache_hybrid_recurrent_decode_state_t : public llama_memory_decod
         return status;
     }
 
+    const llama_ubatch & get_ubatch() const override {
+        assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+        return ubatches[i_next];
+    }
+
+    //
+    // llama_kv_cache_hybrid_recurrent_state_i
+    //
+
+    const llama_kv_cache_unified_state_i * get_state_attn() const override {
+        return &state_attn;
+    }
+
+    const llama_kv_cache_recurrent_state_i * get_state_recurrent() const override {
+        return &state_recurrent;
+    }
+
 private:
     const llama_memory_status status;
 
@@ -3251,6 +3289,9 @@ class llama_kv_cache_hybrid_recurrent_decode_state_t : public llama_memory_decod
 
     std::vector<uint32_t>     heads_attn;
    std::vector<llama_ubatch> ubatches;
+
+    const llama_kv_cache_unified_state     state_attn;
+    const llama_kv_cache_recurrent_state_t state_recurrent;
 };
 
 llama_kv_cache_hybrid_recurrent::llama_kv_cache_hybrid_recurrent(
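
The next()/apply() split above changes the iteration contract: next() now only advances the cursor and reports whether another ubatch remains, while apply() writes the current ubatch into both child caches. As a minimal caller-side sketch (assumed usage, not part of this diff; the real driver lives elsewhere in llama.cpp), the state is presumably consumed roughly like this, assuming it starts positioned on the first ubatch:

// Hypothetical helper; `kv`, `batch`, `n_ubatch`, `embd_pooled` and
// `logits_all` are assumed to be provided by the caller.
static void decode_with_hybrid_cache(llama_kv_cache_hybrid_recurrent * kv,
        const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
    auto state = kv->init_batch(batch, n_ubatch, embd_pooled, logits_all);
    if (state->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
        return; // failed to prepare the ubatches
    }

    do {
        const llama_ubatch & ubatch = state->get_ubatch(); // current micro-batch
        state->apply(); // reserve slots for `ubatch` in both child caches
        // ... build and compute the graph for `ubatch`, using get_state_attn()
        //     and get_state_recurrent() for the per-cache views ...
        (void) ubatch; // unused in this sketch
    } while (state->next()); // advance; returns false after the last ubatch
}
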
@@ -3338,7 +3379,7 @@ llama_pos llama_kv_cache_hybrid_recurrent::seq_pos_max(llama_seq_id seq_id) cons
     return std::min(kv_attn->seq_pos_max(seq_id), kv_recurrent->seq_pos_max(seq_id));
 }
 
-llama_memory_decode_state_ptr llama_kv_cache_hybrid_recurrent::init(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
+llama_memory_state_ptr llama_kv_cache_hybrid_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
 
     // since this includes a recurrent cache, we cannot use split_simple
     auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all);
@@ -3362,20 +3403,24 @@ llama_memory_decode_state_ptr llama_kv_cache_hybrid_recurrent::init(const llama_
     if (!kv_recurrent->prepare(ubatches)) {
         // TODO: will the recurrent cache be in an undefined state at this point?
         LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
-        return std::make_unique<llama_kv_cache_hybrid_recurrent_decode_state_t>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+        return std::make_unique<llama_kv_cache_hybrid_recurrent_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
     }
 
     // prepare the attention cache
     auto heads_attn = kv_attn->prepare(ubatches);
     if (heads_attn.empty()) {
         LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
-        return std::make_unique<llama_kv_cache_hybrid_recurrent_decode_state_t>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+        return std::make_unique<llama_kv_cache_hybrid_recurrent_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
     }
 
-    return std::make_unique<llama_kv_cache_hybrid_recurrent_decode_state_t>(
+    return std::make_unique<llama_kv_cache_hybrid_recurrent_state>(
         this, std::move(sbatch), std::move(heads_attn), std::move(ubatches));
 }
 
+llama_memory_state_ptr llama_kv_cache_hybrid_recurrent::init_full() {
+    return std::make_unique<llama_kv_cache_hybrid_recurrent_state>(this);
+}
+
 bool llama_kv_cache_hybrid_recurrent::update(llama_context & lctx) {
     bool res = false;
 
@@ -3390,11 +3435,6 @@ void llama_kv_cache_hybrid_recurrent::defrag_sched(float thold) {
     kv_recurrent->defrag_sched(thold);
 }
 
-void llama_kv_cache_hybrid_recurrent::set_full() {
-    kv_attn     ->set_full();
-    kv_recurrent->set_full();
-}
-
 bool llama_kv_cache_hybrid_recurrent::get_can_shift() const {
     // TODO: Should this return true if the attention cache can shift?
     return false;
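
With set_full() removed, the full-cache path presumably goes through init_full() instead: rather than mutating both child caches in place, the hybrid cache hands back a state built with the "init full" constructor. A rough sketch of the assumed replacement on the caller side (hypothetical, not part of this diff):

// Before (old API, removed above):
//   kv->set_full();
//   ... build the worst-case graph directly against the caches ...
//
// After (assumed usage of the new API):
static void reserve_full_graph(llama_kv_cache_hybrid_recurrent * kv) {
    auto full = kv->init_full();
    if (!full || full->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
        return;
    }
    // ... build the worst-case graph through full->get_state_attn() and
    //     full->get_state_recurrent() instead of touching kv directly ...
}
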