Memory tests #13669


Closed
wants to merge 4 commits into from
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
@@ -109,6 +109,7 @@ if (NOT WIN32)
llama_build_and_test(test-grammar-integration.cpp)
llama_build_and_test(test-llama-grammar.cpp)
llama_build_and_test(test-chat.cpp)
llama_build_and_test(test-memory.cpp)
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
229 changes: 229 additions & 0 deletions tests/test-memory.cpp
@@ -0,0 +1,229 @@
/*------------------------------------------------------------------------------
* Unit tests for llama-memory.h and the derived memory implementations. The
* tests can be run all together or individually.
*
* USAGE: ./bin/test-memory [test_name1 [test_name2 ...]]
*        (with no arguments, all tests are run)
*
* When adding a new test, do the following:
*
* 1. Add the new test_<memory_type>_description function under the
* appropriate memory type section
*
* 2. Add `RUN_TEST(test_<memory_type>_description);` to main (see the
*    illustrative sketch after this comment block)
*----------------------------------------------------------------------------*/
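// An illustrative sketch of the two steps above (the test name here is
// hypothetical and does not exist in this file):
//
//     static void test_llama_kv_cache_unified_clear() { /* ... */ }   // step 1
//
//     RUN_TEST(test_llama_kv_cache_unified_clear);   // step 2, inside main()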

#include "../src/llama-arch.h"
#include "../src/llama-batch.h"
#include "../src/llama-hparams.h"
#include "../src/llama-impl.h"
#include "../src/llama-kv-cache.h"
#include "../src/llama-model.h"

#include "common.h"
#include "ggml.h"
#include "llama.h"

#include <algorithm>
#include <cstdio>
#include <memory>

/*- Helpers ------------------------------------------------------------------*/

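// Construct a minimal llama_model with the given architecture and head/embedding sizes for testing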
static std::shared_ptr<llama_model> _make_model(
llm_arch arch = LLM_ARCH_LLAMA,
uint32_t n_layer = 4,
uint32_t n_embd_head_k = 4,
uint32_t n_embd_head_v = 4,
uint32_t n_head = 8,
uint32_t n_head_kv = 2) {

llama_model_params params = llama_model_default_params();
params.tensor_buft_overrides = nullptr;
std::shared_ptr<llama_model> model(new llama_model(params));
model->hparams = llama_hparams();
model->arch = arch;

model->hparams.n_layer = n_layer;
model->hparams.n_embd_head_k = n_embd_head_k;
model->hparams.n_embd_head_v = n_embd_head_v;

// If set to 0, assume the test will fill out the array elementwise (hybrid)
if (n_head > 0) {
auto& n_head_arr = model->hparams.n_head_arr;
std::fill(n_head_arr.begin(), n_head_arr.end(), n_head);
}
if (n_head_kv > 0) {
auto& n_head_kv_arr = model->hparams.n_head_kv_arr;
std::fill(n_head_kv_arr.begin(), n_head_kv_arr.end(), n_head_kv);
}

return model;
}

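// Build a llama_batch where the tokens of token_seqs[i] are assigned to the sequence ids in seq_ids[i]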
static llama_batch _make_batch(
std::vector<std::vector<llama_token>> token_seqs,
std::vector<std::vector<llama_seq_id>> seq_ids) {
GGML_ASSERT(token_seqs.size() == seq_ids.size());

size_t total_tokens = 0;
for (const auto & token_seq : token_seqs) {
total_tokens += token_seq.size();
}
size_t max_seq_ids = 0;
for (const auto & seq_ids_i : seq_ids) {
max_seq_ids = std::max(max_seq_ids, seq_ids_i.size());
}
llama_batch batch = llama_batch_init(total_tokens, 0, max_seq_ids);

for (size_t i = 0; i < token_seqs.size(); ++i) {
const auto& token_seq = token_seqs[i];
const auto& seq_ids_i = seq_ids[i];
for (int pos = 0; pos < (int)token_seq.size(); ++pos) {
common_batch_add(batch, token_seq[pos], pos, seq_ids_i, false);
}
}
return batch;
}

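// Recursively check whether `parent` appears anywhere in `child`'s source graph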
static bool is_source_tensor(ggml_tensor * child, ggml_tensor * parent) {
if (!child || !parent) return false;
for (size_t i = 0; i < GGML_MAX_SRC; ++i) {
if (child->src[i] == parent) {
return true;
} else if (child->src[i] != nullptr && is_source_tensor(child->src[i], parent)) {
return true;
}
}
return false;
}

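// RAII helper that logs the start and end of each test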
struct log_scope {
const char * name;
explicit log_scope(const char * name) : name(name) {
LLAMA_LOG_INFO("--------\n");
LLAMA_LOG_INFO("START: %s\n", name);
}
~log_scope() {
LLAMA_LOG_INFO("END: %s\n", name);
LLAMA_LOG_INFO("--------\n");
}
};

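// Run the test if its name was passed on the command line, or if no test names were given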
#define RUN_TEST(test_name) \
do { \
bool run_test = argc < 2; \
std::vector<std::string> args(argv + 1, argv + argc); \
if (std::find(args.begin(), args.end(), #test_name) != args.end()) \
run_test = true; \
if (run_test) { \
log_scope __log_scope(#test_name); \
test_name(); \
} \
} while (0)

/*- Unified Cache ------------------------------------------------------------*/

/* Test that the unified cache can be constructed and destructed safely */
static void test_llama_kv_cache_unified_constructor() {
auto model = _make_model();
llama_kv_cache_unified cache(
/* model */ *model,
/* filter */ nullptr,
/* type_k */ GGML_TYPE_F32,
/* type_v */ GGML_TYPE_F16,
/* v_trans */ false,
/* offload */ false,
/* kv_size */ 10,
/* n_seq_max */ 1,
/* padding */ 10,
/* n_swa */ 0,
/* swa_type */ LLAMA_SWA_TYPE_NONE
);
}

/* Test that the unified cache can operate with a single seq */
static void test_llama_kv_cache_unified_single_seq() {
auto model = _make_model();
llama_kv_cache_unified cache(
/* model */ *model,
/* filter */ nullptr,
/* type_k */ GGML_TYPE_F32,
/* type_v */ GGML_TYPE_F16,
/* v_trans */ false,
/* offload */ false,
/* kv_size */ 10,
/* n_seq_max */ 1,
/* padding */ 10,
/* n_swa */ 0,
/* swa_type */ LLAMA_SWA_TYPE_NONE
);

// Create a batch with a single 3-token sequence (seq_id 42) and split it into a ubatch
llama_batch batch1 = _make_batch({{101, 1, 102}}, {{42}});
llama_sbatch sbatch1 = cache.sbatch_init(batch1, false);
llama_ubatch ubatch1 = cache.ubatch_next(sbatch1, 4, false);

// Find a slot for a new sequence
GGML_ASSERT(cache.find_slot(ubatch1));

// Cache the k/v for a single layer in this slot
ggml_context * ctx = ggml_init({10240, NULL, false});
ggml_tensor * k1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_k_gqa(0));
ggml_tensor * v1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_v_gqa(0));
ggml_tensor * k1_view = cache.cpy_k(ctx, k1, 0);
ggml_tensor * v1_view = cache.cpy_v(ctx, v1, 0);
GGML_ASSERT(is_source_tensor(k1_view, k1));
GGML_ASSERT(is_source_tensor(v1_view, v1));

// Create a second batch with different tokens and find a slot for it
llama_batch batch2 = _make_batch({{1, 2, 3, 4}}, {{5}});
llama_sbatch sbatch2 = cache.sbatch_init(batch2, false);
llama_ubatch ubatch2 = cache.ubatch_next(sbatch2, 4, false);
GGML_ASSERT(cache.find_slot(ubatch2));

// Copy k/v for the second ubatch into layer 0 and verify the views source the new tensors
ggml_tensor * k2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_k_gqa(0));
ggml_tensor * v2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_v_gqa(0));
ggml_tensor * k2_view = cache.cpy_k(ctx, k2, 0);
ggml_tensor * v2_view = cache.cpy_v(ctx, v2, 0);
GGML_ASSERT(is_source_tensor(k2_view, k2));
GGML_ASSERT(is_source_tensor(v2_view, v2));

// Make sure the second batch's views do not source the first batch's tensors
GGML_ASSERT(!is_source_tensor(k2_view, k1));
GGML_ASSERT(!is_source_tensor(v2_view, v1));

// Find a slot for the first ubatch again and make sure it succeeds
GGML_ASSERT(cache.find_slot(ubatch1));

// Clean up
llama_batch_free(batch1);
llama_batch_free(batch2);
ggml_free(ctx);
}

/*- Recurrent Cache ----------------------------------------------------------*/

/* Test that the recurrent cache can be constructed and destructed safely */
static void test_llama_kv_cache_recurrent_constructor() {
auto model = _make_model(LLM_ARCH_MAMBA);
llama_kv_cache_recurrent cache(
/* model */ *model,
/* type_k */ GGML_TYPE_F32,
/* type_v */ GGML_TYPE_F16,
/* offload */ false,
/* kv_size */ 10,
/* n_seq_max */ 1
);
}

/*- Main ---------------------------------------------------------------------*/

int main(int argc, char* argv[]) {
// Unified Cache Tests
RUN_TEST(test_llama_kv_cache_unified_constructor);
RUN_TEST(test_llama_kv_cache_unified_single_seq);
// Recurrent Cache Tests
RUN_TEST(test_llama_kv_cache_recurrent_constructor);
return 0;
}