From 1fb82233a4fddaf6353686d5e5acc376c9590438 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Tue, 20 May 2025 12:59:07 -0600
Subject: [PATCH 1/4] tests: Initial unit tests for memory hierarchy

These only test the basics so far, but should allow for more expansive
tests to come.

Branch: MemoryTests

Signed-off-by: Gabe Goodhart
---
 tests/test-memory.cpp | 175 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 tests/test-memory.cpp

diff --git a/tests/test-memory.cpp b/tests/test-memory.cpp
new file mode 100644
index 0000000000000..ad6c13800cbb6
--- /dev/null
+++ b/tests/test-memory.cpp
@@ -0,0 +1,175 @@
+/*------------------------------------------------------------------------------
+ * Unit tests for llama-memory.h and derived memory implementations. It contains
+ * a number of tests which can be run all together or separately.
+ *
+ * USAGE: ./bin/test-memory
+ *
+ * When adding a new test, do the following:
+ *
+ * 1. Add the new test_<memory_type>_description function under the
+ *    appropriate memory type section
+ *
+ * 2. Add `RUN_TEST(test_<memory_type>_description);` to main
+ *----------------------------------------------------------------------------*/
+
+#include "../src/llama-arch.h"
+#include "../src/llama-batch.h"
+#include "../src/llama-hparams.h"
+#include "../src/llama-impl.h"
+#include "../src/llama-kv-cache.h"
+#include "../src/llama-model.h"
+
+#include "common.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+/*- Helpers ------------------------------------------------------------------*/
+
+static std::shared_ptr<llama_model> _make_model(
+        llm_arch arch = LLM_ARCH_LLAMA,
+        uint32_t n_layer = 4,
+        uint32_t n_embd_head_k = 4,
+        uint32_t n_embd_head_v = 4,
+        uint32_t n_head = 8,
+        uint32_t n_head_kv = 2) {
+
+    llama_model_params params;
+    params.tensor_buft_overrides = nullptr;
+    std::shared_ptr<llama_model> model(new llama_model(params));
+    model->hparams = llama_hparams();
+    model->arch = arch;
+
+    model->hparams.n_layer = n_layer;
+    model->hparams.n_embd_head_k = n_embd_head_k;
+    model->hparams.n_embd_head_v = n_embd_head_v;
+
+    // If set to 0, assume the test will fill out the array elementwise (hybrid)
+    if (n_head > 0) {
+        auto& n_head_arr = model->hparams.n_head_arr;
+        std::fill(n_head_arr.begin(), n_head_arr.end(), n_head);
+    }
+    if (n_head_kv > 0) {
+        auto& n_head_kv_arr = model->hparams.n_head_kv_arr;
+        std::fill(n_head_kv_arr.begin(), n_head_kv_arr.end(), n_head_kv);
+    }
+
+    return model;
+}
+
+struct log_scope {
+    const char * name;
+    explicit log_scope(const char * name) : name(name) {
+        LLAMA_LOG_INFO("--------\n");
+        LLAMA_LOG_INFO("START: %s\n", name);
+    }
+    ~log_scope() {
+        LLAMA_LOG_INFO("END: %s\n", name);
+        LLAMA_LOG_INFO("--------\n");
+    }
+};
+
+#define RUN_TEST(test_name) \
+    do { \
+        bool run_test = argc < 2; \
+        std::vector<std::string> args(argv + 1, argv + argc); \
+        if (std::find(args.begin(), args.end(), #test_name) != args.end()) \
+            run_test = true; \
+        if (run_test) { \
+            log_scope __log_scope(#test_name); \
+            test_name(); \
+        } \
+    } while (0)
+
+/*- Unified Cache ------------------------------------------------------------*/
+
+/* Test that the unified cache can be constructed and destructed safely */
+static void test_llama_kv_cache_unified_constructor() {
+    auto model = _make_model();
+    llama_kv_cache_unified cache(
+        /* model */ *model,
+        /* filter */ nullptr,
+        /* type_k */ GGML_TYPE_F32,
+        /* type_v */ GGML_TYPE_F16,
+        /* v_trans */ false,
+        /* offload */ false,
+        /* kv_size */ 10,
+        /* padding */ 10,
+        /* n_swa */ 0,
+        /* swa_type */ LLAMA_SWA_TYPE_NONE
+    );
+}
+
+/* Test that the unified cache can operate with a single seq */
+static void test_llama_kv_cache_unified_single_seq() {
+    auto model = _make_model();
+    llama_kv_cache_unified cache(
+        /* model */ *model,
+        /* filter */ nullptr,
+        /* type_k */ GGML_TYPE_F32,
+        /* type_v */ GGML_TYPE_F16,
+        /* v_trans */ false,
+        /* offload */ false,
+        /* kv_size */ 10,
+        /* padding */ 10,
+        /* n_swa */ 0,
+        /* swa_type */ LLAMA_SWA_TYPE_NONE
+    );
+    GGML_ASSERT(cache.get_used_cells() == 0);
+
+    // Create the micro batch with a single 3-token sequence
+    //
+    // NOTE: A bunch of these asserts were just me figuring out how the batches
+    // relate to each other, but they're left for future readers to help in the
+    // same understanding process.
+    llama_seq_id seq_id = 42;
+    llama_batch batch = llama_batch_init(3, 0, 1);
+    common_batch_add(batch, 101, 0, {seq_id}, false);
+    common_batch_add(batch, 1, 1, {seq_id}, false);
+    common_batch_add(batch, 102, 2, {seq_id}, false);
+    llama_sbatch sbatch(batch, 0, true, false);
+    GGML_ASSERT(batch.n_tokens == 3);
+    GGML_ASSERT(sbatch.n_tokens == 3);
+    GGML_ASSERT(!sbatch.seq.empty());
+    llama_ubatch ubatch = sbatch.split_simple(4);
+    printf("ubatch.n_seqs=%d\n", ubatch.n_seqs);
+    GGML_ASSERT(ubatch.n_seqs == 3);
+    GGML_ASSERT(ubatch.n_seq_tokens == 1);
+    GGML_ASSERT(ubatch.n_tokens == 3);
+    GGML_ASSERT(ubatch.seq_id[0][0] == seq_id);
+    GGML_ASSERT(ubatch.seq_id[1][0] == seq_id);
+    GGML_ASSERT(ubatch.seq_id[2][0] == seq_id);
+
+    // Find a slot for a new sequence
+    GGML_ASSERT(cache.find_slot(ubatch));
+
+    // Clean up
+    llama_batch_free(batch);
+}
+
+/*- Recurrent Cache ----------------------------------------------------------*/
+
+/* Test that the recurrent cache can be constructed and destructed safely */
+static void test_llama_kv_cache_recurrent_constructor() {
+    auto model = _make_model(LLM_ARCH_MAMBA);
+    llama_kv_cache_recurrent cache(
+        /* model */ *model,
+        /* type_k */ GGML_TYPE_F32,
+        /* type_v */ GGML_TYPE_F16,
+        /* offload */ false,
+        /* kv_size */ 10
+    );
+}
+
+/*- Main ---------------------------------------------------------------------*/
+
+int main(int argc, char* argv[]) {
+    // Unified Cache Tests
+    RUN_TEST(test_llama_kv_cache_unified_constructor);
+    RUN_TEST(test_llama_kv_cache_unified_single_seq);
+    // Recurrent Cache Tests
+    RUN_TEST(test_llama_kv_cache_recurrent_constructor);
+    return 0;
+}

From 426fb77c2112bf637035f8b165ce10bc25b41e9c Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Tue, 20 May 2025 12:59:36 -0600
Subject: [PATCH 2/4] build: Add build step for test-memory on non-windows builds

These tests use private headers, so they won't build on Windows.

Branch: MemoryTests

Signed-off-by: Gabe Goodhart
---
 tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 62a9f5842bca8..ff3d97d7a27eb 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -109,6 +109,7 @@ if (NOT WIN32)
     llama_build_and_test(test-grammar-integration.cpp)
     llama_build_and_test(test-llama-grammar.cpp)
     llama_build_and_test(test-chat.cpp)
+    llama_build_and_test(test-memory.cpp)
    # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
    if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
        llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
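With the CMake entry above in place, the test binary can be exercised locally roughly as follows (a sketch only: it assumes a standard CMake build tree under build/ and that llama_build_and_test names the target test-memory after the source file; neither detail is spelled out in these patches):

    cmake -B build
    cmake --build build --target test-memory
    ./build/bin/test-memory                                          # run every test
    ./build/bin/test-memory test_llama_kv_cache_unified_single_seq   # run only the named test

Per the RUN_TEST macro in test-memory.cpp, invoking the binary with no arguments runs all tests, while passing one or more test names runs just those.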
From 5c11928572c52b5e559aa6079ad242d49c0a0f43 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Tue, 27 May 2025 09:00:50 -0600
Subject: [PATCH 3/4] fix(tests): Fix constructors in tests for signature changes after rebase

Branch: HybridCache

Signed-off-by: Gabe Goodhart
---
 tests/test-memory.cpp | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/tests/test-memory.cpp b/tests/test-memory.cpp
index ad6c13800cbb6..36b008da9d8ca 100644
--- a/tests/test-memory.cpp
+++ b/tests/test-memory.cpp
@@ -89,16 +89,17 @@
 static void test_llama_kv_cache_unified_constructor() {
     auto model = _make_model();
     llama_kv_cache_unified cache(
-        /* model */ *model,
-        /* filter */ nullptr,
-        /* type_k */ GGML_TYPE_F32,
-        /* type_v */ GGML_TYPE_F16,
-        /* v_trans */ false,
-        /* offload */ false,
-        /* kv_size */ 10,
-        /* padding */ 10,
-        /* n_swa */ 0,
-        /* swa_type */ LLAMA_SWA_TYPE_NONE
+        /* model     */ *model,
+        /* filter    */ nullptr,
+        /* type_k    */ GGML_TYPE_F32,
+        /* type_v    */ GGML_TYPE_F16,
+        /* v_trans   */ false,
+        /* offload   */ false,
+        /* kv_size   */ 10,
+        /* n_seq_max */ 1,
+        /* padding   */ 10,
+        /* n_swa     */ 0,
+        /* swa_type  */ LLAMA_SWA_TYPE_NONE
     );
 }

@@ -113,11 +114,11 @@
         /* v_trans */ false,
         /* offload */ false,
         /* kv_size */ 10,
+        /* n_seq_max */ 1,
         /* padding */ 10,
         /* n_swa */ 0,
         /* swa_type */ LLAMA_SWA_TYPE_NONE
     );
-    GGML_ASSERT(cache.get_used_cells() == 0);

     // Create the micro batch with a single 3-token sequence
     //
@@ -155,11 +156,12 @@
 static void test_llama_kv_cache_recurrent_constructor() {
     auto model = _make_model(LLM_ARCH_MAMBA);
     llama_kv_cache_recurrent cache(
-        /* model */ *model,
-        /* type_k */ GGML_TYPE_F32,
-        /* type_v */ GGML_TYPE_F16,
-        /* offload */ false,
-        /* kv_size */ 10
+        /* model     */ *model,
+        /* type_k    */ GGML_TYPE_F32,
+        /* type_v    */ GGML_TYPE_F16,
+        /* offload   */ false,
+        /* kv_size   */ 10,
+        /* n_seq_max */ 1
     );
 }

From ba118a26fbac9ccf8b6516ff7aa5b8ebf6c5faeb Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Fri, 23 May 2025 17:29:04 -0600
Subject: [PATCH 4/4] tests(wip): More robust test for unified cache

I'm still not clear how cache hits should be detected, since find_slot
does not seem to take the tokens themselves into account; it simply looks
for a sequence of cells that fits the size of the ubatch and has no
positions set in any of the cells. I'm clearly still missing something
about how this works!
Branch: HybridCache

Signed-off-by: Gabe Goodhart
---
 tests/test-memory.cpp | 98 +++++++++++++++++++++++++++++++++----------
 1 file changed, 75 insertions(+), 23 deletions(-)

diff --git a/tests/test-memory.cpp b/tests/test-memory.cpp
index 36b008da9d8ca..d843a6b5ea175 100644
--- a/tests/test-memory.cpp
+++ b/tests/test-memory.cpp
@@ -20,6 +20,7 @@
 #include "../src/llama-model.h"

 #include "common.h"
+#include "ggml.h"
 #include "llama.h"

 #include <algorithm>
@@ -59,6 +60,43 @@ static std::shared_ptr<llama_model> _make_model(
     return model;
 }

+static llama_batch _make_batch(
+        std::vector<std::vector<llama_token>>  token_seqs,
+        std::vector<std::vector<llama_seq_id>> seq_ids) {
+    GGML_ASSERT(token_seqs.size() == seq_ids.size());
+
+    size_t total_tokens = 0;
+    for (const auto & token_seq : token_seqs) {
+        total_tokens += token_seq.size();
+    }
+    size_t max_seq_ids = 0;
+    for (const auto & seq_ids_i : seq_ids) {
+        max_seq_ids = std::max(max_seq_ids, seq_ids_i.size());
+    }
+    llama_batch batch = llama_batch_init(total_tokens, 0, max_seq_ids);
+
+    for (size_t i = 0; i < token_seqs.size(); ++i) {
+        const auto& token_seq = token_seqs[i];
+        const auto& seq_ids_i = seq_ids[i];
+        for (int pos = 0; pos < (int)token_seq.size(); ++pos) {
+            common_batch_add(batch, token_seq[pos], pos, seq_ids_i, false);
+        }
+    }
+    return batch;
+}
+
+static bool is_source_tensor(ggml_tensor * child, ggml_tensor * parent) {
+    if (!child || !parent) return false;
+    for (size_t i = 0; i < GGML_MAX_SRC; ++i) {
+        if (child->src[i] == parent) {
+            return true;
+        } else if (child->src[i] != nullptr && is_source_tensor(child->src[i], parent)) {
+            return true;
+        }
+    }
+    return false;
+}
+
 struct log_scope {
     const char * name;
     explicit log_scope(const char * name) : name(name) {
@@ -121,33 +159,47 @@ static void test_llama_kv_cache_unified_single_seq() {
     );

     // Create the micro batch with a single 3-token sequence
-    //
-    // NOTE: A bunch of these asserts were just me figuring out how the batches
-    // relate to each other, but they're left for future readers to help in the
-    // same understanding process.
-    llama_seq_id seq_id = 42;
-    llama_batch batch = llama_batch_init(3, 0, 1);
-    common_batch_add(batch, 101, 0, {seq_id}, false);
-    common_batch_add(batch, 1, 1, {seq_id}, false);
-    common_batch_add(batch, 102, 2, {seq_id}, false);
-    llama_sbatch sbatch(batch, 0, true, false);
-    GGML_ASSERT(batch.n_tokens == 3);
-    GGML_ASSERT(sbatch.n_tokens == 3);
-    GGML_ASSERT(!sbatch.seq.empty());
-    llama_ubatch ubatch = sbatch.split_simple(4);
-    printf("ubatch.n_seqs=%d\n", ubatch.n_seqs);
-    GGML_ASSERT(ubatch.n_seqs == 3);
-    GGML_ASSERT(ubatch.n_seq_tokens == 1);
-    GGML_ASSERT(ubatch.n_tokens == 3);
-    GGML_ASSERT(ubatch.seq_id[0][0] == seq_id);
-    GGML_ASSERT(ubatch.seq_id[1][0] == seq_id);
-    GGML_ASSERT(ubatch.seq_id[2][0] == seq_id);
+    llama_batch batch1 = _make_batch({{101, 1, 102}}, {{42}});
+    llama_sbatch sbatch1 = cache.sbatch_init(batch1, false);
+    llama_ubatch ubatch1 = cache.ubatch_next(sbatch1, 4, false);

     // Find a slot for a new sequence
-    GGML_ASSERT(cache.find_slot(ubatch));
+    GGML_ASSERT(cache.find_slot(ubatch1));
+
+    // Cache the k/v for a single layer in this slot
+    ggml_context * ctx = ggml_init({10240, NULL, false});
+    ggml_tensor * k1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_k_gqa(0));
+    ggml_tensor * v1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_v_gqa(0));
+    ggml_tensor * k1_view = cache.cpy_k(ctx, k1, 0);
+    ggml_tensor * v1_view = cache.cpy_v(ctx, v1, 0);
+    GGML_ASSERT(is_source_tensor(k1_view, k1));
+    GGML_ASSERT(is_source_tensor(v1_view, v1));
+
+    // Create a second batch with different tokens and find a slot for it
+    llama_batch batch2 = _make_batch({{1, 2, 3, 4}}, {{5}});
+    llama_sbatch sbatch2 = cache.sbatch_init(batch2, false);
+    llama_ubatch ubatch2 = cache.ubatch_next(sbatch2, 4, false);
+    GGML_ASSERT(cache.find_slot(ubatch2));
+
+    // Add some different tensors
+    ggml_tensor * k2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_k_gqa(0));
+    ggml_tensor * v2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, model->hparams.n_embd_v_gqa(0));
+    ggml_tensor * k2_view = cache.cpy_k(ctx, k2, 0);
+    ggml_tensor * v2_view = cache.cpy_v(ctx, v2, 0);
+    GGML_ASSERT(is_source_tensor(k2_view, k2));
+    GGML_ASSERT(is_source_tensor(v2_view, v2));
+
+    // Make sure the first batch's k/v are not treated as a cache hit
+    GGML_ASSERT(!is_source_tensor(k2_view, k1));
+    GGML_ASSERT(!is_source_tensor(v2_view, v1));
+
+    // Re-find the slot for the first batch and make sure it is a cache hit
+    GGML_ASSERT(cache.find_slot(ubatch1));

     // Clean up
-    llama_batch_free(batch);
+    llama_batch_free(batch1);
+    llama_batch_free(batch2);
+    ggml_free(ctx);
 }

 /*- Recurrent Cache ----------------------------------------------------------*/
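Following the two-step recipe in the test-memory.cpp header comment, a further test would slot in roughly like this (a sketch only: test_llama_kv_cache_recurrent_single_seq is a hypothetical name and its body is a placeholder, not something introduced by the patches above; the constructor arguments mirror the post-rebase signature from PATCH 3/4):

/* Test that the recurrent cache can operate with a single seq (placeholder) */
static void test_llama_kv_cache_recurrent_single_seq() {
    auto model = _make_model(LLM_ARCH_MAMBA);
    llama_kv_cache_recurrent cache(
        /* model     */ *model,
        /* type_k    */ GGML_TYPE_F32,
        /* type_v    */ GGML_TYPE_F16,
        /* offload   */ false,
        /* kv_size   */ 10,
        /* n_seq_max */ 1
    );
    // Build a single-sequence batch with the _make_batch helper and exercise
    // the cache here (e.g. find a slot for the resulting ubatch).
    llama_batch batch = _make_batch({{101, 1, 102}}, {{42}});
    // ... assertions against the cache go here ...
    llama_batch_free(batch);
}

and, in main() under the Recurrent Cache section:

    RUN_TEST(test_llama_kv_cache_recurrent_single_seq);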