3 files changed, 29 insertions(+), 4 deletions(-)

src/llama-kv-cache.cpp:

```diff
@@ -136,7 +136,10 @@ class llama_kv_cache_unified_state : public llama_kv_cache_unified_state_i {
     std::vector<uint32_t> heads;
     std::vector<llama_ubatch> ubatches;
 
+    //
     // data needed for building the compute graph for the current ubatch:
+    //
+
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // as the cache gets filled, the benefit from this heuristic disappears
     int32_t n_kv;
@@ -1876,7 +1879,10 @@ class llama_kv_cache_unified_iswa_state : public llama_kv_cache_unified_iswa_state_i {
 
     std::vector<llama_ubatch> ubatches;
 
+    //
     // data needed for building the compute graph for the current ubatch:
+    //
+
     int32_t n_kv_base;
     int32_t head_base;
 
@@ -2123,7 +2129,7 @@ class llama_kv_cache_recurrent_state_t : public llama_kv_cache_recurrent_state_i {
         return kv->s_copy(i);
     }
 
-    float s_mask(int i) const override {
+    float s_mask(int i) const override {
         return kv->s_mask(i);
     }
 
@@ -2132,13 +2138,18 @@ class llama_kv_cache_recurrent_state_t : public llama_kv_cache_recurrent_state_i {
 
     llama_kv_cache_recurrent * kv;
 
-    const bool is_full = false;
-
     llama_sbatch sbatch;
 
     size_t i_next = 0;
 
     std::vector<llama_ubatch> ubatches;
+
+    //
+    // data needed for building the compute graph for the current ubatch:
+    // TODO: extract all the state like `head` and `n` here
+    //
+
+    const bool is_full = false;
 };
 
 llama_kv_cache_recurrent::llama_kv_cache_recurrent(
```
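The comment blocks added in all three state classes mark the same split: members that track processing of the whole batch versus members that only feed the compute graph of the current ubatch (the recurrent TODO proposes also moving `head` and `n` into that second group). A minimal sketch of the resulting layout, with hypothetical stand-in names rather than the actual llama.cpp members:

```cpp
#include <cstdint>
#include <vector>

struct example_ubatch {}; // stand-in for llama_ubatch

class example_cache_state {
private:
    // state for processing the whole batch: one entry per ubatch
    std::vector<uint32_t>       heads;    // write position chosen for each ubatch
    std::vector<example_ubatch> ubatches; // the batch, split into ubatches

    //
    // data needed for building the compute graph for the current ubatch:
    //

    int32_t  n_kv = 0; // how much of the cache the graph attends to
    uint32_t head = 0; // where the current ubatch is written
};
```

Keeping the per-ubatch members in one clearly marked section makes it mechanical to later extract them behind a dedicated state interface, which is what the TODOs in the header below propose.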
src/llama-kv-cache.h:

```diff
@@ -40,6 +40,9 @@ struct llama_kv_cache : public llama_memory_i {
     virtual bool update(llama_context & lctx) = 0;
 
     // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
+    // TODO: change to
+    //   llama_memory_state_ptr init_defrag(float thold) = 0;
+    //
     virtual void defrag_sched(float thold) = 0;
 
     // getters
@@ -253,7 +256,7 @@ class llama_kv_cache_unified_state_i : public llama_memory_state_i {
     virtual ggml_tensor * get_k(ggml_context * ctx, int32_t il) const = 0;
     virtual ggml_tensor * get_v(ggml_context * ctx, int32_t il) const = 0;
 
-    // store k_cur and v_cur in the cache based on the current head location
+    // store k_cur and v_cur in the cache based on the provided head location
     virtual ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const = 0;
     virtual ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const = 0;
 
@@ -359,6 +362,8 @@ class llama_kv_cache_unified_iswa_state_i : public llama_memory_state_i {
 // llama_kv_cache_recurrent
 //
 
+// TODO: extract the KV cache state used for graph computation into llama_kv_cache_recurrent_state_i
+//       see the implementation of llama_kv_cache_unified_state_i for an example of how to do it
 class llama_kv_cache_recurrent : public llama_kv_cache {
 public:
     llama_kv_cache_recurrent(
```
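The TODO on defrag_sched() points in the same state-object direction: instead of setting a flag that a later update() acts on, init_defrag() would hand back a llama_memory_state_ptr that the caller applies explicitly. A self-contained sketch of that control flow, using simplified stand-in types (only the init_defrag signature comes from the TODO; everything else here is an assumption):

```cpp
#include <memory>

// simplified stand-ins, not the actual llama.cpp declarations
enum example_memory_status {
    EXAMPLE_MEMORY_STATUS_SUCCESS,
    EXAMPLE_MEMORY_STATUS_FAILED, // cf. LLAMA_MEMORY_STATUS_FAILED_* in llama-memory.h
};

struct example_memory_state_i {
    virtual ~example_memory_state_i() = default;
    virtual bool apply() = 0; // the single mutating entry point
    virtual example_memory_status get_status() const = 0;
};

using example_memory_state_ptr = std::unique_ptr<example_memory_state_i>;

struct example_kv_cache {
    // today: remember that a defrag is wanted; performed later during update()
    void defrag_sched(float thold) { defrag_thold = thold; }

    // proposed: return a state describing the planned defrag, applied explicitly;
    // this sketch plans nothing, so it returns nullptr
    example_memory_state_ptr init_defrag(float /*thold*/) { return nullptr; }

    float defrag_thold = -1.0f;
};

void example_usage(example_kv_cache & kv) {
    if (example_memory_state_ptr st = kv.init_defrag(0.1f)) {
        if (st->get_status() == EXAMPLE_MEMORY_STATUS_SUCCESS) {
            st->apply(); // the only step that mutates the cache
        }
    }
}
```

This would fold defragmentation into the same init-then-apply lifecycle that batch processing already uses, instead of a side channel acted on inside update().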
src/llama-memory.h:

```diff
@@ -42,6 +42,15 @@ enum llama_memory_status {
     LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
 };
 
+// the interface for managing the memory state during batch processing
+// this interface is extended per memory type with specific methods used for constructing the compute graphs. see:
+//   - llama_kv_cache_unified_state_i
+//   - llama_kv_cache_unified_iswa_state_i
+//   ...
+//
+// these extended interfaces should mutate neither the memory nor the current memory state
+// the only method that can mutate the memory and the memory state is llama_memory_i::apply()
+//
 class llama_memory_state_i {
 public:
     virtual ~llama_memory_state_i() = default;
```
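The contract spelled out in this new comment is: the per-type extensions add const accessors for graph construction, and apply() is the single mutating call. A sketch of a consumer following that contract, again with simplified stand-in types (next() and apply() are assumptions based on the described flow; the get_k()/get_v() signatures are the ones declared in llama-kv-cache.h above):

```cpp
#include <cstdint>

struct ggml_tensor;  // opaque, as in ggml
struct ggml_context;

struct example_memory_state_i {
    virtual ~example_memory_state_i() = default;

    // the only call allowed to mutate the memory and the memory state
    virtual bool apply() = 0;

    // advance to the next ubatch of the batch, if any
    virtual bool next() = 0;
};

// per-memory-type extension: const-only methods for building compute graphs,
// mirroring llama_kv_cache_unified_state_i
struct example_unified_state_i : example_memory_state_i {
    virtual ggml_tensor * get_k(ggml_context * ctx, int32_t il) const = 0;
    virtual ggml_tensor * get_v(ggml_context * ctx, int32_t il) const = 0;
};

// a consumer mutates only through apply(), then builds the graph for the
// current ubatch through the read-only extension methods
inline void process_batch(example_unified_state_i & st, ggml_context * ctx) {
    do {
        st.apply();                     // commit state for the current ubatch
        (void) st.get_k(ctx, /*il=*/0); // read-only graph construction
        (void) st.get_v(ctx, /*il=*/0);
    } while (st.next());
}
```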