From 1fb82233a4fddaf6353686d5e5acc376c9590438 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Tue, 20 May 2025 12:59:07 -0600
Subject: [PATCH 1/4] tests: Initial unit tests for memory hierarchy

These only test the basics so far, but should allow for more expansive
tests to come.

Branch: MemoryTests

Signed-off-by: Gabe Goodhart
---
 tests/test-memory.cpp | 175 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 tests/test-memory.cpp

diff --git a/tests/test-memory.cpp b/tests/test-memory.cpp
new file mode 100644
index 0000000000000..ad6c13800cbb6
--- /dev/null
+++ b/tests/test-memory.cpp
@@ -0,0 +1,175 @@
+/*------------------------------------------------------------------------------
+ * Unit tests for llama-memory.h and derived memory implementations. It contains
+ * a number of tests which can be run all together or separately.
+ *
+ * USAGE: ./bin/test-memory
+ *
+ * When adding a new test, do the following:
+ *
+ * 1. Add the new test_<memory_type>_description function under the
+ *    appropriate memory type section
+ *
+ * 2. Add `RUN_TEST(test_<memory_type>_description);` to main
+ *----------------------------------------------------------------------------*/
+
+#include "../src/llama-arch.h"
+#include "../src/llama-batch.h"
+#include "../src/llama-hparams.h"
+#include "../src/llama-impl.h"
+#include "../src/llama-kv-cache.h"
+#include "../src/llama-model.h"
+
+#include "common.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+/*- Helpers ------------------------------------------------------------------*/
+
+static std::shared_ptr<llama_model> _make_model(
+        llm_arch arch = LLM_ARCH_LLAMA,
+        uint32_t n_layer = 4,
+        uint32_t n_embd_head_k = 4,
+        uint32_t n_embd_head_v = 4,
+        uint32_t n_head = 8,
+        uint32_t n_head_kv = 2) {
+
+    llama_model_params params;
+    params.tensor_buft_overrides = nullptr;
+    std::shared_ptr<llama_model> model(new llama_model(params));
+    model->hparams = llama_hparams();
+    model->arch = arch;
+
+    model->hparams.n_layer = n_layer;
+    model->hparams.n_embd_head_k = n_embd_head_k;
+    model->hparams.n_embd_head_v = n_embd_head_v;
+
+    // If set to 0, assume the test will fill out the array elementwise (hybrid)
+    if (n_head > 0) {
+        auto& n_head_arr = model->hparams.n_head_arr;
+        std::fill(n_head_arr.begin(), n_head_arr.end(), n_head);
+    }
+    if (n_head_kv > 0) {
+        auto& n_head_kv_arr = model->hparams.n_head_kv_arr;
+        std::fill(n_head_kv_arr.begin(), n_head_kv_arr.end(), n_head_kv);
+    }
+
+    return model;
+}
+
+struct log_scope {
+    const char * name;
+    explicit log_scope(const char * name) : name(name) {
+        LLAMA_LOG_INFO("--------\n");
+        LLAMA_LOG_INFO("START: %s\n", name);
+    }
+    ~log_scope() {
+        LLAMA_LOG_INFO("END: %s\n", name);
+        LLAMA_LOG_INFO("--------\n");
+    }
+};
+
+#define RUN_TEST(test_name) \
+    do { \
+        bool run_test = argc < 2; \
+        std::vector<std::string> args(argv + 1, argv + argc); \
+        if (std::find(args.begin(), args.end(), #test_name) != args.end()) \
+            run_test = true; \
+        if (run_test) { \
+            log_scope __log_scope(#test_name); \
+            test_name(); \
+        } \
+    } while (0)
+
+/*- Unified Cache ------------------------------------------------------------*/
+
+/* Test that the unified cache can be constructed and destructed safely */
+static void test_llama_kv_cache_unified_constructor() {
+    auto model = _make_model();
+    llama_kv_cache_unified cache(
+        /* model */ *model,
+        /* filter */ nullptr,
+        /* type_k */ GGML_TYPE_F32,
+        /* type_v */ GGML_TYPE_F16,
+        /* v_trans */ false,
+        /* offload */ false,
+        /* kv_size */ 10,
+        /* padding */ 10,
+        /* n_swa */ 0,
+        /* swa_type */ LLAMA_SWA_TYPE_NONE
+    );
+}
+
+/* Test that the unified cache can operate with a single seq */
+static void test_llama_kv_cache_unified_single_seq() {
+    auto model = _make_model();
+    llama_kv_cache_unified cache(
+        /* model */ *model,
+        /* filter */ nullptr,
+        /* type_k */ GGML_TYPE_F32,
+        /* type_v */ GGML_TYPE_F16,
+        /* v_trans */ false,
+        /* offload */ false,
+        /* kv_size */ 10,
+        /* padding */ 10,
+        /* n_swa */ 0,
+        /* swa_type */ LLAMA_SWA_TYPE_NONE
+    );
+    GGML_ASSERT(cache.get_used_cells() == 0);
+
+    // Create the micro batch with a single 3-token sequence
+    //
+    // NOTE: A bunch of these asserts were just me figuring out how the batches
+    // relate to each other, but they're left for future readers to help in the
+    // same understanding process.
+    llama_seq_id seq_id = 42;
+    llama_batch batch = llama_batch_init(3, 0, 1);
+    common_batch_add(batch, 101, 0, {seq_id}, false);
+    common_batch_add(batch, 1, 1, {seq_id}, false);
+    common_batch_add(batch, 102, 2, {seq_id}, false);
+    llama_sbatch sbatch(batch, 0, true, false);
+    GGML_ASSERT(batch.n_tokens == 3);
+    GGML_ASSERT(sbatch.n_tokens == 3);
+    GGML_ASSERT(!sbatch.seq.empty());
+    llama_ubatch ubatch = sbatch.split_simple(4);
+    printf("ubatch.n_seqs=%d\n", ubatch.n_seqs);
+    GGML_ASSERT(ubatch.n_seqs == 3);
+    GGML_ASSERT(ubatch.n_seq_tokens == 1);
+    GGML_ASSERT(ubatch.n_tokens == 3);
+    GGML_ASSERT(ubatch.seq_id[0][0] == seq_id);
+    GGML_ASSERT(ubatch.seq_id[1][0] == seq_id);
+    GGML_ASSERT(ubatch.seq_id[2][0] == seq_id);
+
+    // Find a slot for a new sequence
+    GGML_ASSERT(cache.find_slot(ubatch));
+
+    // Clean up
+    llama_batch_free(batch);
+}
+
+/*- Recurrent Cache ----------------------------------------------------------*/
+
+/* Test that the recurrent cache can be constructed and destructed safely */
+static void test_llama_kv_cache_recurrent_constructor() {
+    auto model = _make_model(LLM_ARCH_MAMBA);
+    llama_kv_cache_recurrent cache(
+        /* model */ *model,
+        /* type_k */ GGML_TYPE_F32,
+        /* type_v */ GGML_TYPE_F16,
+        /* offload */ false,
+        /* kv_size */ 10
+    );
+}
+
+/*- Main ---------------------------------------------------------------------*/
+
+int main(int argc, char* argv[]) {
+    // Unified Cache Tests
+    RUN_TEST(test_llama_kv_cache_unified_constructor);
+    RUN_TEST(test_llama_kv_cache_unified_single_seq);
+    // Recurrent Cache Tests
+    RUN_TEST(test_llama_kv_cache_recurrent_constructor);
+    return 0;
+}

From 426fb77c2112bf637035f8b165ce10bc25b41e9c Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Tue, 20 May 2025 12:59:36 -0600
Subject: [PATCH 2/4] build: Add build step for test-memory on non-windows builds

These tests use private headers, so they won't build on Windows.

Branch: MemoryTests

Signed-off-by: Gabe Goodhart
---
 tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 62a9f5842bca8..ff3d97d7a27eb 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -109,6 +109,7 @@ if (NOT WIN32)
     llama_build_and_test(test-grammar-integration.cpp)
     llama_build_and_test(test-llama-grammar.cpp)
     llama_build_and_test(test-chat.cpp)
+    llama_build_and_test(test-memory.cpp)
    # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
    if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
        llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
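With the CMake entry above in place, the test binary can be exercised locally roughly as follows (a sketch only: it assumes a standard CMake build tree under build/ and that llama_build_and_test names the target test-memory after the source file; neither detail is spelled out in these patches):

    cmake -B build
    cmake --build build --target test-memory
    ./build/bin/test-memory                                          # run every test
    ./build/bin/test-memory test_llama_kv_cache_unified_single_seq   # run only the named test

Per the RUN_TEST macro in test-memory.cpp, invoking the binary with no arguments runs all tests, while passing one or more test names runs just those.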
From 5c11928572c52b5e559aa6079ad242d49c0a0f43 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Tue, 27 May 2025 09:00:50 -0600
Subject: [PATCH 3/4] fix(tests): Fix constructors in tests for signature changes after rebase

Branch: HybridCache

Signed-off-by: Gabe Goodhart
---
 tests/test-memory.cpp | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/tests/test-memory.cpp b/tests/test-memory.cpp
index ad6c13800cbb6..36b008da9d8ca 100644
--- a/tests/test-memory.cpp
+++ b/tests/test-memory.cpp
@@ -89,16 +89,17 @@
 static void test_llama_kv_cache_unified_constructor() {
     auto model = _make_model();
     llama_kv_cache_unified cache(
-        /* model */ *model,
-        /* filter */ nullptr,
-        /* type_k */ GGML_TYPE_F32,
-        /* type_v */ GGML_TYPE_F16,
-        /* v_trans */ false,
-        /* offload */ false,
-        /* kv_size */ 10,
-        /* padding */ 10,
-        /* n_swa */ 0,
-        /* swa_type */ LLAMA_SWA_TYPE_NONE
+        /* model     */ *model,
+        /* filter    */ nullptr,
+        /* type_k    */ GGML_TYPE_F32,
+        /* type_v    */ GGML_TYPE_F16,
+        /* v_trans   */ false,
+        /* offload   */ false,
+        /* kv_size   */ 10,
+        /* n_seq_max */ 1,
+        /* padding   */ 10,
+        /* n_swa     */ 0,
+        /* swa_type  */ LLAMA_SWA_TYPE_NONE
     );
 }

@@ -113,11 +114,11 @@
         /* v_trans */ false,
         /* offload */ false,
         /* kv_size */ 10,
+        /* n_seq_max */ 1,
         /* padding */ 10,
         /* n_swa */ 0,
         /* swa_type */ LLAMA_SWA_TYPE_NONE
     );
-    GGML_ASSERT(cache.get_used_cells() == 0);

     // Create the micro batch with a single 3-token sequence
     //
@@ -155,11 +156,12 @@
 static void test_llama_kv_cache_recurrent_constructor() {
     auto model = _make_model(LLM_ARCH_MAMBA);
     llama_kv_cache_recurrent cache(
-        /* model */ *model,
-        /* type_k */ GGML_TYPE_F32,
-        /* type_v */ GGML_TYPE_F16,
-        /* offload */ false,
-        /* kv_size */ 10
+        /* model     */ *model,
+        /* type_k    */ GGML_TYPE_F32,
+        /* type_v    */ GGML_TYPE_F16,
+        /* offload   */ false,
+        /* kv_size   */ 10,
+        /* n_seq_max */ 1
     );
 }

From ba118a26fbac9ccf8b6516ff7aa5b8ebf6c5faeb Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Fri, 23 May 2025 17:29:04 -0600
Subject: [PATCH 4/4] tests(wip): More robust test for unified cache

I'm still not clear how cache hits should be detected, since find_slot
does not seem to take the tokens themselves into account; it simply looks
for a sequence of cells that fits the size of the ubatch and has no
positions set in any of the cells. I'm clearly still missing something
about how this works!
Branch: HybridCache

Signed-off-by: Gabe Goodhart
---
 tests/test-memory.cpp | 98 +++++++++++++++++++++++++++++++++----------
 1 file changed, 75 insertions(+), 23 deletions(-)

diff --git a/tests/test-memory.cpp b/tests/test-memory.cpp
index 36b008da9d8ca..d843a6b5ea175 100644
--- a/tests/test-memory.cpp
+++ b/tests/test-memory.cpp
@@ -20,6 +20,7 @@
 #include "../src/llama-model.h"

 #include "common.h"
+#include "ggml.h"
 #include "llama.h"

 #include <algorithm>
@@ -59,6 +60,43 @@ static std::shared_ptr<llama_model> _make_model(
     return model;
 }

+static llama_batch _make_batch(
+        std::vector<std::vector<llama_token>>  token_seqs,
+        std::vector<std::vector<llama_seq_id>> seq_ids) {
+    GGML_ASSERT(token_seqs.size() == seq_ids.size());
+
+    size_t total_tokens = 0;
+    for (const auto & token_seq : token_seqs) {
+        total_tokens += token_seq.size();
+    }
+    size_t max_seq_ids = 0;
+    for (const auto & seq_ids_i : seq_ids) {
+        max_seq_ids = std::max(max_seq_ids, seq_ids_i.size());
+    }
+    llama_batch batch = llama_batch_init(total_tokens, 0, max_seq_ids);
+
+    for (size_t i = 0; i < token_seqs.size(); ++i) {
+        const auto& token_seq = token_seqs[i];
+        const auto& seq_ids_i = seq_ids[i];
+        for (int pos = 0; pos < (int)token_seq.size(); ++pos) {
+            common_batch_add(batch, token_seq[pos], pos, seq_ids_i, false);
+        }
+    }
+    return batch;
+}
+
+static bool is_source_tensor(ggml_tensor * child, ggml_tensor * parent) {
+    if (!child || !parent) return false;
+    for (size_t i = 0; i < GGML_MAX_SRC; ++i) {
+        if (child->src[i] == parent) {
+            return true;
+        } else if (child->src[i] != nullptr && is_source_tensor(child->src[i], parent)) {
+            return true;
+        }
+    }
+    return false;
+}
+
 struct log_scope {
     const char * name;
     explicit log_scope(const char * name) : name(name) {
@@ -121,33 +159,47 @@ static void test_llama_kv_cache_unified_single_seq() {
     );

     // Create the micro batch with a single 3-token sequence
-    //
-    // NOTE: A bunch of these asserts were just me figuring out how the batches
-    // relate to each other, but they're left for future readers to help in the
-    // same understanding process.
-    llama_seq_id seq_id = 42;
-    llama_batch batch = llama_batch_init(3, 0, 1);
-    common_batch_add(batch, 101, 0, {seq_id}, false);
-    common_batch_add(batch, 1, 1, {seq_id}, false);
-    common_batch_add(batch, 102, 2, {seq_id}, false);
-    llama_sbatch sbatch(batch, 0, true, false);
-    GGML_ASSERT(batch.n_tokens == 3);
-    GGML_ASSERT(sbatch.n_tokens == 3);
-    GGML_ASSERT(!sbatch.seq.empty());
-    llama_ubatch ubatch = sbatch.split_simple(4);
-    printf("ubatch.n_seqs=%d\n", ubatch.n_seqs);
-    GGML_ASSERT(ubatch.n_seqs == 3);
-    GGML_ASSERT(ubatch.n_seq_tokens == 1);
-    GGML_ASSERT(ubatch.n_tokens == 3);
-    GGML_ASSERT(ubatch.seq_id[0][0] == seq_id);
-    GGML_ASSERT(ubatch.seq_id[1][0] == seq_id);
-    GGML_ASSERT(ubatch.seq_id[2][0] == seq_id);
+    llama_batch batch1 = _make_batch({{101, 1, 102}}, {{42}});
+    llama_sbatch sbatch1 = cache.sbatch_init(batch1, false);
+    llama_ubatch ubatch1 = cache.ubatch_next(sbatch1, 4, false);

     // Find a slot for a new sequence
-    GGML_ASSERT(cache.find_slot(ubatch));
+    GGML_ASSERT(cache.find_slot(ubatch1));
+
+    // Cache the k/v for a single layer in this slot
+    ggml_context * ctx = ggml_init({10240, NULL, false});
+    ggml_tensor * k1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_k_gqa(0));
+    ggml_tensor * v1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_v_gqa(0));
+    ggml_tensor * k1_view = cache.cpy_k(ctx, k1, 0);
+    ggml_tensor * v1_view = cache.cpy_v(ctx, v1, 0);
+    GGML_ASSERT(is_source_tensor(k1_view, k1));
+    GGML_ASSERT(is_source_tensor(v1_view, v1));
+
+    // Create a second batch with different tokens and find a slot for it
+    llama_batch batch2 = _make_batch({{1, 2, 3, 4}}, {{5}});
+    llama_sbatch sbatch2 = cache.sbatch_init(batch2, false);
+    llama_ubatch ubatch2 = cache.ubatch_next(sbatch2, 4, false);
+    GGML_ASSERT(cache.find_slot(ubatch2));
+
+    // Add some different tensors
+    ggml_tensor * k2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_k_gqa(0));
+    ggml_tensor * v2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_v_gqa(0));
+    ggml_tensor * k2_view = cache.cpy_k(ctx, k2, 0);
+    ggml_tensor * v2_view = cache.cpy_v(ctx, v2, 0);
+    GGML_ASSERT(is_source_tensor(k2_view, k2));
+    GGML_ASSERT(is_source_tensor(v2_view, v2));
+
+    // Make sure the first batch's k/v are not treated as a cache hit
+    GGML_ASSERT(!is_source_tensor(k2_view, k1));
+    GGML_ASSERT(!is_source_tensor(v2_view, v1));
+
+    // Re-find the slot for the first batch and make sure it is a cache hit
+    GGML_ASSERT(cache.find_slot(ubatch1));

     // Clean up
-    llama_batch_free(batch);
+    llama_batch_free(batch1);
+    llama_batch_free(batch2);
+    ggml_free(ctx);
 }

 /*- Recurrent Cache ----------------------------------------------------------*/
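Following the two-step recipe in the test-memory.cpp header comment, a further test would slot in roughly like this (a sketch only: test_llama_kv_cache_recurrent_single_seq is a hypothetical name and its body is a placeholder, not something introduced by the patches above; the constructor arguments mirror the post-rebase signature from PATCH 3/4):

/* Test that the recurrent cache can operate with a single seq (placeholder) */
static void test_llama_kv_cache_recurrent_single_seq() {
    auto model = _make_model(LLM_ARCH_MAMBA);
    llama_kv_cache_recurrent cache(
        /* model     */ *model,
        /* type_k    */ GGML_TYPE_F32,
        /* type_v    */ GGML_TYPE_F16,
        /* offload   */ false,
        /* kv_size   */ 10,
        /* n_seq_max */ 1
    );
    // Build a single-sequence batch with the _make_batch helper and exercise
    // the cache here (e.g. find a slot for the resulting ubatch).
    llama_batch batch = _make_batch({{101, 1, 102}}, {{42}});
    // ... assertions against the cache go here ...
    llama_batch_free(batch);
}

and, in main() under the Recurrent Cache section:

    RUN_TEST(test_llama_kv_cache_recurrent_single_seq);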