From f78b396ee7a5d4c47cf3e3a8cb9fb02a4d3fe250 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 13 Jan 2025 14:13:11 +0200
Subject: [PATCH 01/84] llama : add struct llama_kv_cache (wip) [no ci]

---
 common/common.cpp                |   6 +-
 common/speculative.cpp           |  10 +-
 examples/embedding/embedding.cpp |   5 +-
 include/llama.h                  |  79 +++----
 src/llama-context.cpp            |  16 +-
 src/llama-kv-cache.cpp           | 286 ++++---------------------
 src/llama-kv-cache.h             | 350 ++++++++++++++++++++++++++-----
 src/llama.cpp                    |  91 ++------
 8 files changed, 428 insertions(+), 415 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 6dea8e3d25238..29de45189e2d3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -952,7 +952,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+    llama_kv_cache * kv = llama_get_kv_cache(lctx);
+
+    if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) {
         LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -1057,7 +1059,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_cache_clear(lctx);
+        llama_kv_cache_clear(kv);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
     }
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 318e96ea35468..6ac0585178ebd 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -171,8 +171,10 @@ llama_tokens common_speculative_gen_draft(
     llama_tokens result;
     result.reserve(params.n_draft);
 
+    llama_kv_cache * kv = llama_get_kv_cache(ctx);
+
     if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_cache_clear(kv);
 
         prompt.clear();
     } else {
@@ -191,14 +193,14 @@ llama_tokens common_speculative_gen_draft(
     }
 
     if (reuse_i > 0) {
-        llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_kv_cache_seq_rm (kv, 0, 0, reuse_i);
+        llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i);
 
         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }
 
     if (reuse_n < (int) prompt.size()) {
-        llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+        llama_kv_cache_seq_rm (kv, 0, reuse_n, -1);
 
         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 38d22c90f82bb..fda0949f1c4cf 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -34,10 +34,11 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    const struct llama_model * model = llama_get_model(ctx);
+    const llama_model * model = llama_get_model(ctx);
+    llama_kv_cache * kv = llama_get_kv_cache(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_cache_clear(kv);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
diff --git a/include/llama.h b/include/llama.h
index 3b75e760780ef..08b8658ad89ac 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -60,6 +60,7 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
+    struct llama_kv_cache;
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -467,8 +468,9 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
-    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx); // TODO: remove const?
+    LLAMA_API struct llama_kv_cache *    llama_get_kv_cache(      struct llama_context * ctx);
+    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
 
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@@ -584,7 +586,7 @@ extern "C" {
     // KV cache
     //
 
-    // TODO: remove llama_kv_cache_view_* API
+    // TODO: start using struct llama_kv_cache
 
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
@@ -639,14 +641,20 @@ extern "C" {
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "use llama_kv_cache_n_tokens instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+            "use llama_kv_cache_used_cells instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
+            struct llama_kv_cache * kv);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@@ -654,26 +662,26 @@ extern "C" {
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API bool llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1);
 
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1);
 
     // Removes all tokens that do not belong to the specified sequence
     LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id);
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta);
 
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d);
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d);
 
     // Returns the largest position present in the KV cache for the specified sequence
     LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id);
-
-    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
-    // how to avoid this?
+            struct llama_kv_cache * kv,
+            llama_seq_id seq_id);
 
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
     //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+    LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv);
 
     // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+    LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv);
+
+    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
+    LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv);
 
     //
     // State / sessions
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 671d2a81adabf..bf5a77ccaff1b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -602,11 +602,15 @@ uint32_t llama_n_seq_max(const struct llama_context * ctx) {
     return ctx->kv_self.size;
 }
 
-const struct llama_model * llama_get_model(const struct llama_context * ctx) {
+const llama_model * llama_get_model(const llama_context * ctx) {
     return &ctx->model;
 }
 
-enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
+llama_kv_cache * llama_get_kv_cache(llama_context * ctx) {
+    return &ctx->kv_self;
+}
+
+enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
     return ctx->cparams.pooling_type;
 }
 
@@ -1142,7 +1146,7 @@ struct llama_data_read {
         if (dest_seq_id != -1) {
             // single sequence
 
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+            kv_self.seq_rm(dest_seq_id, -1, -1);
 
             llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
             batch.n_tokens = cell_count;
@@ -1185,7 +1189,7 @@ struct llama_data_read {
                 return false;
             }
 
-            llama_kv_cache_clear(kv_self);
+            kv_self.clear();
 
             for (uint32_t i = 0; i < cell_count; ++i) {
                 llama_kv_cell & cell = kv_self.cells[i];
@@ -1362,9 +1366,9 @@ struct llama_data_read {
 
         if (!res) {
             if (seq_id == -1) {
-                llama_kv_cache_clear(ctx);
+                ctx->kv_self.clear();
             } else {
-                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
+                ctx->kv_self.seq_rm(seq_id, -1, -1);
             }
             throw std::runtime_error("failed to restore kv cache");
         }
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index feffdf0de52cf..b0d5a931839f8 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -350,277 +350,67 @@ uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
     return 0;
 }
 
-void llama_kv_cache_clear(struct llama_kv_cache & cache) {
-    for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
-        cache.cells[i].pos = -1;
-        cache.cells[i].seq_id.clear();
-        cache.cells[i].src = -1;
-        cache.cells[i].tail = -1;
-    }
-    cache.head = 0;
-    cache.used = 0;
-
-    for (auto & buf : cache.bufs) {
-        ggml_backend_buffer_clear(buf.get(), 0);
-    }
+void llama_kv_cache_clear(llama_kv_cache * kv) {
+    kv->clear();
 }
 
 bool llama_kv_cache_seq_rm(
-        struct llama_kv_cache & cache,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1) {
-    uint32_t new_head = cache.size;
-
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
-
-    // models like Mamba or RWKV can't have a state partially erased
-    if (cache.recurrent) {
-        if (seq_id >= (int64_t) cache.size) {
-            // could be fatal
-            return false;
-        }
-        if (0 <= seq_id) {
-            int32_t & tail_id = cache.cells[seq_id].tail;
-            if (tail_id >= 0) {
-                const llama_kv_cell & cell = cache.cells[tail_id];
-                // partial intersection is invalid
-                if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
-                    return false;
-                }
-                // invalidate tails which will be cleared
-                if (p0 <= cell.pos && cell.pos < p1) {
-                    tail_id = -1;
-                }
-            }
-        } else {
-            // seq_id is negative, then the range should include everything or nothing
-            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
-                return false;
-            }
-        }
-    }
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            if (seq_id < 0) {
-                cache.cells[i].seq_id.clear();
-            } else if (cache.cells[i].has_seq_id(seq_id)) {
-                cache.cells[i].seq_id.erase(seq_id);
-            } else {
-                continue;
-            }
-            if
(cache.cells[i].is_empty()) { - // keep count of the number of used cells - if (cache.cells[i].pos >= 0) cache.used--; - - cache.cells[i].pos = -1; - cache.cells[i].src = -1; - if (new_head == cache.size) new_head = i; - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size && new_head < cache.head) cache.head = new_head; - - return true; + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return kv->seq_rm(seq_id, p0, p1); } void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - - if (cache.recurrent) { - if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { - llama_kv_cell & tail_src = cache.cells[seq_id_src]; - llama_kv_cell & tail_dst = cache.cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cache.cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - cache.used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cache.cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - // otherwise, this is the KV cache of a Transformer-like model - - cache.head = 0; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].seq_id.insert(seq_id_dst); - } - } + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); } -void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) { - uint32_t new_head = cache.size; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.recurrent && (llama_seq_id) i != seq_id) { - cache.cells[i].tail = -1; - } - if (!cache.cells[i].has_seq_id(seq_id)) { - if (cache.cells[i].pos >= 0) cache.used--; - cache.cells[i].pos = -1; - cache.cells[i].src = -1; - cache.cells[i].seq_id.clear(); - if (new_head == cache.size) new_head = i; - } else { - cache.cells[i].seq_id.clear(); - cache.cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size && new_head < cache.head) cache.head = new_head; +void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id) { + kv->seq_keep(seq_id); } void llama_kv_cache_seq_add( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - uint32_t new_head = cache.size; - - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - // If there is no range then return early to avoid looping over the cache. 
- if (p0 == p1) return; - - if (cache.recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) cache.size) { - const int32_t tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cache.cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.has_shift = true; - cache.cells[i].pos += delta; - cache.cells[i].delta += delta; - - if (cache.cells[i].pos < 0) { - if (!cache.cells[i].is_empty()) { - cache.used--; - } - cache.cells[i].pos = -1; - cache.cells[i].seq_id.clear(); - if (new_head == cache.size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - cache.head = new_head != cache.size ? new_head : 0; + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + kv->seq_add(seq_id, p0, p1, delta); } void llama_kv_cache_seq_div( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - // If there is no range then return early to avoid looping over the cache. - if (p0 == p1) return; - - if (cache.recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) cache.size) { - const int32_t tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cache.cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - return; - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.has_shift = true; - - { - llama_pos p_old = cache.cells[i].pos; - cache.cells[i].pos /= d; - cache.cells[i].delta += cache.cells[i].pos - p_old; - } - } - } + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + kv->seq_div(seq_id, p0, p1, d); } -llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) { - llama_pos result = 0; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id)) { - result = std::max(result, cache.cells[i].pos); - } - } - - return result; +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id) { + return kv->seq_pos_max(seq_id); } -void llama_kv_cache_defrag(struct llama_kv_cache & cache) { - if (!cache.recurrent) { - cache.do_defrag = true; - } +void llama_kv_cache_defrag(llama_kv_cache * kv) { + kv->defrag(); } -int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv) { - int result = 0; - - for (uint32_t i = 0; i < kv.size; i++) { - result += kv.cells[i].seq_id.size(); - } - - return result; +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + return kv->n_tokens(); } -int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv) { - return kv.used; +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + return kv->used; } -bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv) { - return kv.can_shift; +bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { + return kv->can_shift; } // @@ -632,7 
+422,7 @@ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache /*.n_cells = */ 0, /*.n_seq_max = */ n_seq_max, /*.token_count = */ 0, - /*.used_cells = */ llama_get_kv_cache_used_cells(kv), + /*.used_cells = */ llama_kv_cache_used_cells(&kv), /*.max_contiguous = */ 0, /*.max_contiguous_idx = */ -1, /*.cells = */ nullptr, diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dca6f3998c645..b0bb1cfb14f12 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -57,6 +57,16 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; + int32_t n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; + } + size_t total_size() const { size_t size = 0; for (const auto & buf : bufs) { @@ -75,6 +85,297 @@ struct llama_kv_cache { return max_pos; } + + void clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } + } + + bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (recurrent) { + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; + } + + void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if (recurrent) { + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + llama_kv_cell & tail_src = cells[seq_id_src]; + llama_kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } + + return; + } + + // otherwise, this is the KV of a Transformer-like model + head = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { + cells[i].seq_id.insert(seq_id_dst); + } + } + } + + void seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if (recurrent && (llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } + } + + void seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + cells[i].pos += delta; + cells[i].delta += delta; + + if (cells[i].pos < 0) { + if (!cells[i].is_empty()) { + used--; + } + cells[i].pos = -1; + cells[i].seq_id.clear(); + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != size ? new_head : 0; + } + + void seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } + } + + llama_pos seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; + } + + void defrag() { + if (!recurrent) { + do_defrag = true; + } + } }; // a structure holds information about the slot found in llama_kv_cache_find_slot @@ -112,51 +413,6 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find how many cells are currently in use uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache); -void llama_kv_cache_clear(struct llama_kv_cache & cache); - -bool llama_kv_cache_seq_rm( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1); - -void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1); - -void llama_kv_cache_seq_keep( - struct llama_kv_cache & cache, - llama_seq_id seq_id); - -void llama_kv_cache_seq_add( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta); - -void llama_kv_cache_seq_div( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d); - -llama_pos llama_kv_cache_seq_pos_max( - struct llama_kv_cache & cache, - llama_seq_id seq_id); - -void llama_kv_cache_defrag(struct llama_kv_cache & cache); - -int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv); - -int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv); - -bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv); - // // kv cache view // @@ -206,10 +462,10 @@ struct llama_kv_slot_restorer { cache.n = old_state.n; if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased - llama_kv_cache_seq_rm(cache, -1, -1, -1); + cache.seq_rm(-1, -1, -1); } else { for (auto & slot : slot_boundaries) { - llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second); + cache.seq_rm(-1, slot.first, slot.second); } } } diff --git a/src/llama.cpp b/src/llama.cpp index 094157ccf2aa2..87dd512b2546a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8564,7 +8564,7 @@ static int llama_decode_impl( // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_cache_update(&lctx); + llama_update_kv_cache(&lctx, &lctx.kv_self); // TODO: lctx->update_kv_cache() // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -8760,7 +8760,7 @@ static int llama_decode_impl( if (fragmentation > cparams.defrag_thold) { //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); - llama_kv_cache_defrag(kv_self); + kv_self.defrag(); } } @@ -9182,11 +9182,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { 
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_kv_cache_update_impl(struct llama_context & lctx) { +static void llama_update_kv_cache_impl(llama_context & lctx, llama_kv_cache & kv) { bool need_reserve = false; - if (lctx.kv_self.has_shift) { - if (!llama_kv_cache_can_shift(&lctx)) { + if (kv.has_shift) { + if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); } @@ -9206,23 +9206,21 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) { } { - auto & kv_self = lctx.kv_self; + kv.has_shift = false; - kv_self.has_shift = false; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; } } } // defragment the KV cache if needed - if (lctx.kv_self.do_defrag) { + if (kv.do_defrag) { llama_kv_cache_defrag_impl(lctx); need_reserve = true; - lctx.kv_self.do_defrag = false; + kv.do_defrag = false; } // reserve a worst case graph again @@ -9845,6 +9843,7 @@ struct llama_context * llama_init_from_model( return ctx; } +// deprecated struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params) { @@ -9855,73 +9854,27 @@ struct llama_context * llama_new_context_with_model( // kv cache // -// TODO: tmp bridges below until `struct llama_kv_cache` is exposed through the public API - -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); } -void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) { +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { llama_kv_cache_view_update(view, ctx->kv_self); } -int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return llama_get_kv_cache_token_count(ctx->kv_self); -} - -int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) { - return llama_get_kv_cache_used_cells(ctx->kv_self); -} - -void llama_kv_cache_clear(struct llama_context * ctx) { - llama_kv_cache_clear(ctx->kv_self); -} - -bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); -} - -void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { - llama_kv_cache_seq_keep(ctx->kv_self, seq_id); -} - -void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } - - llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); -} - -void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); -} - -llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); -} - -void llama_kv_cache_defrag(struct llama_context * ctx) { - 
llama_kv_cache_defrag(ctx->kv_self); +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_cache_n_tokens(&ctx->kv_self); } -void llama_kv_cache_update(struct llama_context * ctx) { - llama_kv_cache_update_impl(*ctx); +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(&ctx->kv_self); } -bool llama_kv_cache_can_shift(struct llama_context * ctx) { - return llama_kv_cache_can_shift(ctx->kv_self); +// TODO: move to llama-context +void llama_update_kv_cache(llama_context * ctx, llama_kv_cache * kv) { + llama_update_kv_cache_impl(*ctx, *kv); } /// From e4550fbafc44403b243fe029937a97a0aed7bbd6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 14:56:52 +0200 Subject: [PATCH 02/84] llama : cont ggml-ci --- examples/batched-bench/batched-bench.cpp | 6 ++-- .../cvector-generator/cvector-generator.cpp | 3 +- examples/gritlm/gritlm.cpp | 8 +++-- examples/imatrix/imatrix.cpp | 4 ++- examples/infill/infill.cpp | 6 ++-- examples/llama-bench/llama-bench.cpp | 6 ++-- examples/lookahead/lookahead.cpp | 13 ++++---- examples/lookup/lookup.cpp | 3 +- examples/main/main.cpp | 14 +++++---- examples/parallel/parallel.cpp | 11 +++---- examples/passkey/passkey.cpp | 30 ++++++++++--------- examples/perplexity/perplexity.cpp | 24 +++++++++++---- examples/retrieval/retrieval.cpp | 4 ++- examples/run/run.cpp | 7 +++-- examples/save-load-state/save-load-state.cpp | 4 ++- examples/server/server.cpp | 25 +++++++++------- examples/simple-chat/simple-chat.cpp | 6 ++-- .../speculative-simple/speculative-simple.cpp | 4 ++- examples/speculative/speculative.cpp | 29 ++++++++++-------- 19 files changed, 128 insertions(+), 79 deletions(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f119a7..fcbad37bb3f2f 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -57,6 +57,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const int32_t n_kv_max = llama_n_ctx(ctx); llama_batch batch = llama_batch_init(n_kv_max, 0, 1); @@ -132,7 +134,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -141,7 +143,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 413b71d34c52b..adb4a60ada41f 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,7 +342,8 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache_clear(ctx); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_kv_cache_clear(kv); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 72eb46257429e..16437453edb89 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -13,6 +13,8 @@ static 
std::vector> encode(llama_context * ctx, const std::ve const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { @@ -45,7 +47,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -100,9 +102,11 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index b5f3feb9f82e6..5efe4f019f562 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -431,6 +431,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); @@ -497,7 +499,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 489a208b66b34..de8e7769552bb 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -139,6 +139,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_model_n_ctx_train(model); @@ -332,8 +334,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(kv, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 4ac19ca86ec56..8843c0048d6cc 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1575,9 +1575,11 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + test t(inst, lmodel, ctx); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // cool off before the test if (params.delay) { @@ -1617,7 +1619,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); uint64_t t_start = get_time_ns(); diff --git a/examples/lookahead/lookahead.cpp 
b/examples/lookahead/lookahead.cpp index 2f0898e6284a0..1219c207464d2 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -60,6 +60,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -95,7 +96,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -437,17 +438,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(ctx, -1, n_past, -1); + llama_kv_cache_seq_rm(kv, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(ctx, seq_id_best); - llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1); + llama_kv_cache_seq_keep(kv, seq_id_best); + llama_kv_cache_seq_cp (kv, seq_id_best, 0, -1, -1); + llama_kv_cache_seq_rm (kv, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index dbd0444ec8742..8628f7318556c 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -35,6 +35,7 @@ int main(int argc, char ** argv){ llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -192,7 +193,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + llama_kv_cache_seq_rm(kv, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index da2a03ab9ba10..9d79af79e2723 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,6 +157,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -328,7 +330,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + llama_kv_cache_seq_rm(kv, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -569,8 +571,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (kv, 0, params.n_keep , 
params.n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -593,9 +595,9 @@ int main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_cache_seq_add(kv, 0, ga_i, n_past, ib*bd); + llama_kv_cache_seq_div(kv, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_cache_seq_add(kv, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7ef43d5e12876..2ba0706dc5d24 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -134,6 +134,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -201,7 +202,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } LOG_INF("\n"); @@ -233,9 +234,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(ctx, i, -1, -1); + llama_kv_cache_seq_rm(kv, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -371,8 +372,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); + llama_kv_cache_seq_rm(kv, client.id + 1, -1, -1); + llama_kv_cache_seq_cp(kv, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5953928d47d33..e2764313b2f01 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -86,6 +86,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -132,11 +134,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_add(kv, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div(kv, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; } common_batch_clear(batch); @@ -166,12 +168,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_cache_defrag (kv); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; common_batch_clear(batch); @@ -197,12 +199,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_cache_defrag (kv); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9bf6c57433ab2..6c9f716ede23c 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -299,6 +299,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -360,7 +362,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -450,6 +452,8 @@ static 
results_perplexity perplexity(llama_context * ctx, const common_params & const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -546,7 +550,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -741,6 +745,8 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -923,7 +929,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1084,6 +1090,8 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); @@ -1202,7 +1210,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1388,6 +1396,8 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); @@ -1574,7 +1584,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1671,6 +1681,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1764,7 +1776,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 2439022a229b7..a907ea07607dd 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -82,8 +82,10 @@ static void batch_add_seq(llama_batch & batch, 
const std::vector & toke } static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + llama_kv_cache * kv = llama_get_kv_cache(ctx); + // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 92a49eb744fda..8e2c174a955e8 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -756,7 +756,8 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0; + const llama_kv_cache * kv = llama_get_kv_cache(llama_data.context.get()); + const bool is_first = llama_kv_cache_used_cells(kv) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -771,8 +772,10 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { + llama_kv_cache * kv = llama_get_kv_cache(ctx.get()); + const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); + const int n_ctx_used = llama_kv_cache_used_cells(kv); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); printe("context size exceeded\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index cf7cbd8159cf8..3839fbe8c84d5 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -156,6 +156,8 @@ int main(int argc, char ** argv) { // make new context llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); + llama_kv_cache * kv3 = llama_get_kv_cache(ctx3); + llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); @@ -196,7 +198,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(ctx3); + llama_kv_cache_clear(kv3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b1cde2d7f48dd..076044d39679c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1693,6 +1693,7 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; + llama_kv_cache * kv = nullptr; const llama_vocab * vocab = nullptr; @@ -1755,6 +1756,8 @@ struct server_context { return false; } + kv = llama_get_kv_cache(ctx); + vocab = llama_model_get_vocab(model); n_ctx = llama_n_ctx(ctx); @@ -2023,7 +2026,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); clean_kv_cache = false; } @@ -2565,8 +2568,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = 
llama_get_kv_cache_token_count(ctx); - res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv); + res->kv_cache_used_cells = llama_kv_cache_used_cells(kv); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2682,7 +2685,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); + llama_kv_cache_seq_rm(kv, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2750,8 +2753,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_rm (kv, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -2938,8 +2941,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); + llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c); + llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2977,9 +2980,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); + llama_kv_cache_seq_rm(kv, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3219,7 +3222,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index c5534cc13e4b4..130e326b55d4c 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -88,6 +88,8 @@ int main(int argc, char ** argv) { return 1; } + const llama_kv_cache * kv = llama_get_kv_cache(ctx); + // initialize the sampler llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); @@ -98,7 +100,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0; + const bool is_first = llama_kv_cache_used_cells(kv) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -113,7 +115,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_get_kv_cache_used_cells(ctx); + int n_ctx_used = 
llama_kv_cache_used_cells(kv); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 403ba2dd21914..24bdc806d5710 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -45,6 +45,8 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt.model.get(); ctx_tgt = llama_init_tgt.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx_tgt); + const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -217,7 +219,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1); + llama_kv_cache_seq_rm(kv, 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index c7ccea50dbbd4..b4e5259b5be46 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -90,6 +90,9 @@ int main(int argc, char ** argv) { model_dft = llama_init_dft.model.get(); ctx_dft = llama_init_dft.context.get(); + llama_kv_cache * kv_tgt = llama_get_kv_cache(ctx_tgt); + llama_kv_cache * kv_dft = llama_get_kv_cache(ctx_dft); + const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -420,14 +423,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); + llama_kv_cache_seq_keep(kv_dft, s_keep); + llama_kv_cache_seq_cp (kv_dft, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(kv_dft, 0); - llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, 0); + llama_kv_cache_seq_rm (kv_tgt, s_keep, n_past_tgt, -1); + llama_kv_cache_seq_keep(kv_tgt, s_keep); + llama_kv_cache_seq_cp (kv_tgt, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(kv_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -444,8 +447,8 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + llama_kv_cache_seq_rm(kv_dft, 0, n_past_dft, -1); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(kv_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -503,8 +506,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + llama_kv_cache_seq_rm(kv_dft, n_seq_cur, -1, -1); + llama_kv_cache_seq_cp(kv_dft, s, n_seq_cur, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -585,9 +588,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - 
llama_kv_cache_seq_keep(ctx_tgt, 0); + llama_kv_cache_seq_keep(kv_tgt, 0); for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); From 4d7bd03e653f24e00158ae7e819908e444a20353 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 15:50:39 +0200 Subject: [PATCH 03/84] kv_cache : functions -> members ggml-ci --- src/llama-context.cpp | 2 +- src/llama-kv-cache.cpp | 490 ++++++++++++++++++++++++++++++++++------- src/llama-kv-cache.h | 402 +++++---------------------------- src/llama.cpp | 16 +- 4 files changed, 466 insertions(+), 444 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bf5a77ccaff1b..0654feccb8951 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1169,7 +1169,7 @@ struct llama_data_read { } batch.n_seq_id[0] = 1; batch.seq_id[0] = &dest_seq_id; - if (!llama_kv_cache_find_slot(kv_self, batch)) { + if (!kv_self.find_slot(batch)) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index b0d5a931839f8..8b2f6287b8ae7 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -11,41 +11,35 @@ static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; -uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) { - // the FA kernels require padding to avoid extra runtime boundary checks - return cparams.flash_attn ? 256u : 32u; -} - -bool llama_kv_cache_init( - struct llama_kv_cache & cache, - const llama_model & model, - const llama_cparams & cparams, - ggml_type type_k, - ggml_type type_v, - uint32_t kv_size, - bool offload) { +bool llama_kv_cache::init( + const llama_model & model, + const llama_cparams & cparams, + ggml_type type_k, + ggml_type type_v, + uint32_t kv_size, + bool offload) { const struct llama_hparams & hparams = model.hparams; const int32_t n_layer = hparams.n_layer; - cache.has_shift = false; + has_shift = false; - cache.recurrent = llama_model_is_recurrent(&model); - cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + recurrent = llama_model_is_recurrent(&model); + v_trans = !recurrent && !cparams.flash_attn; + can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", - __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); + __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); - cache.head = 0; - cache.size = kv_size; - cache.used = 0; + head = 0; + size = kv_size; + used = 0; - cache.type_k = type_k; - cache.type_v = type_v; + type_k = type_k; + type_v = type_v; - cache.cells.clear(); - cache.cells.resize(kv_size); + cells.clear(); + cells.resize(kv_size); // create a context for each buffer type std::map ctx_map; @@ -57,19 +51,23 @@ bool llama_kv_cache_init( /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; + ggml_context * ctx = ggml_init(params); if (!ctx) { return nullptr; } + ctx_map[buft] = ctx; - cache.ctxs.emplace_back(ctx); + ctxs.emplace_back(ctx); + return ctx; } + return it->second; }; - cache.k_l.reserve(n_layer); - cache.v_l.reserve(n_layer); + 
k_l.reserve(n_layer); + v_l.reserve(n_layer); for (int i = 0; i < n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); @@ -95,8 +93,8 @@ bool llama_kv_cache_init( ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); - cache.k_l.push_back(k); - cache.v_l.push_back(v); + k_l.push_back(k); + v_l.push_back(v); } // allocate tensors and initialize the buffers to avoid NaNs in the padding @@ -111,20 +109,339 @@ bool llama_kv_cache_init( } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - cache.bufs.emplace_back(buf); + bufs.emplace_back(buf); } return true; } -struct llama_kv_cache_slot_info llama_kv_cache_find_slot( - struct llama_kv_cache & cache, +int32_t llama_kv_cache::n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; +} + +size_t llama_kv_cache::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; +} + +// TODO: better data structures to reduce the cost of this operation +llama_pos llama_kv_cache::max_pos() const { + llama_pos max_pos = -1; + for (const auto & cell : cells) { + max_pos = std::max(max_pos, cell.pos); + } + + return max_pos; +} + +void llama_kv_cache::clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } +} + +bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (recurrent) { + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
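    // (head only moves backwards here; the next find_slot()/decode advances it again)
    //
    // Illustrative call-site sketch, assuming a handle obtained via llama_get_kv_cache(ctx)
    // as in the example programs above -- the range is half-open [p0, p1) and negative
    // bounds mean "from the start" / "to the end":
    //
    //   llama_kv_cache_seq_rm(kv, seq_id, n_keep, n_keep + n_discard); // drop a window
    //   llama_kv_cache_seq_rm(kv, seq_id, -1, -1);                     // wipe the whole sequence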
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; +} + +void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if (recurrent) { + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + llama_kv_cell & tail_src = cells[seq_id_src]; + llama_kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } + + return; + } + + // otherwise, this is the KV of a Transformer-like model + head = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { + cells[i].seq_id.insert(seq_id_dst); + } + } +} + +void llama_kv_cache::seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if (recurrent && (llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + cells[i].pos += delta; + cells[i].delta += delta; + + if (cells[i].pos < 0) { + if (!cells[i].is_empty()) { + used--; + } + cells[i].pos = -1; + cells[i].seq_id.clear(); + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != size ? new_head : 0; +} + +void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
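    // (seq_div() rescales positions in place: pos /= d, accumulating the change in `delta`
    //  and raising has_shift so the shift can later be applied to the K cache; this is the
    //  path used by, e.g., the grouped / self-extend attention examples)
    //
    // Hypothetical caller sketch (names are illustrative, not from this patch):
    //
    //   kv.seq_div(seq_id, p0, p1, ga_n); // divide positions in [p0, p1) by the group size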
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } +} + +llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +void llama_kv_cache::defrag() { + if (!recurrent) { + do_defrag = true; + } +} + +struct llama_kv_cache_slot_info llama_kv_cache::find_slot( const struct llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = ubatch.n_seqs; const uint32_t n_seq_tokens = ubatch.n_seq_tokens; - if (cache.recurrent) { + if (recurrent) { // For recurrent state architectures (like Mamba or RWKV), // each cache cell can store the state for a whole sequence. // A slot should be always be contiguous. @@ -132,7 +449,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // can only process batches with an equal number of new tokens in each sequence GGML_ASSERT(ubatch.equal_seqs); - int32_t min = cache.size - 1; + int32_t min = size - 1; int32_t max = 0; // everything should fit if all seq_ids are smaller than the max @@ -141,16 +458,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t j = 0; j < n_seq_id; ++j) { const llama_seq_id seq_id = ubatch.seq_id[s][j]; - if (seq_id < 0 || (uint32_t) seq_id >= cache.size) { + if (seq_id < 0 || (uint32_t) seq_id >= size) { // too big seq_id // TODO: would it be possible to resize the cache instead? 
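            // (for recurrent models the cache keeps one cell per sequence, so `size` doubles
            //  as the maximum number of parallel sequences and the seq_id is used directly
            //  as a cell index -- hence the hard failure here instead of a slot search)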
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); + LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); return llama_kv_cache_slot_info_failed; } if (j > 0) { - llama_kv_cell & seq = cache.cells[seq_id]; + llama_kv_cell & seq = cells[seq_id]; if (seq.tail >= 0) { - llama_kv_cell & cell = cache.cells[seq.tail]; + llama_kv_cell & cell = cells[seq.tail]; // clear cells from seq_ids that become shared // (should not normally happen, but let's handle it anyway) cell.seq_id.erase(seq_id); @@ -158,7 +475,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( if (cell.seq_id.empty()) { cell.pos = -1; cell.src = -1; - cache.used -= 1; + used -= 1; } } } @@ -168,9 +485,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( #ifndef NDEBUG { std::vector tails_verif; - tails_verif.assign(cache.size, -1); - for (uint32_t i = 0; i < cache.size; ++i) { - llama_kv_cell & cell = cache.cells[i]; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + llama_kv_cell & cell = cells[i]; for (llama_seq_id seq_id : cell.seq_id) { if (tails_verif[seq_id] != -1) { LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); @@ -178,20 +495,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( tails_verif[seq_id] = i; } } - for (uint32_t i = 0; i < cache.size; ++i) { - if (tails_verif[i] != cache.cells[i].tail) { - LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]); + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); } } } #endif // find next empty cell - uint32_t next_empty_cell = cache.head; + uint32_t next_empty_cell = head; - for (uint32_t i = 0; i < cache.size; ++i) { - if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } - llama_kv_cell & cell = cache.cells[next_empty_cell]; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; if (cell.is_empty()) { break; } next_empty_cell += 1; } @@ -199,20 +516,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find usable cell range for (uint32_t s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; - llama_kv_cell & seq_meta = cache.cells[seq_id]; + llama_kv_cell & seq_meta = cells[seq_id]; bool has_cell = false; if (seq_meta.tail >= 0) { - llama_kv_cell & cell = cache.cells[seq_meta.tail]; + llama_kv_cell & cell = cells[seq_meta.tail]; GGML_ASSERT(cell.has_seq_id(seq_id)); // does this seq_id "own" the cell? 
if (cell.seq_id.size() == 1) { has_cell = true; } } if (!has_cell) { - llama_kv_cell & empty_cell = cache.cells[next_empty_cell]; + llama_kv_cell & empty_cell = cells[next_empty_cell]; GGML_ASSERT(empty_cell.is_empty()); // copy old tail into the empty cell if (seq_meta.tail >= 0) { - llama_kv_cell & orig_cell = cache.cells[seq_meta.tail]; + llama_kv_cell & orig_cell = cells[seq_meta.tail]; empty_cell.pos = orig_cell.pos; empty_cell.src = orig_cell.src; orig_cell.seq_id.erase(seq_id); @@ -222,9 +539,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find next empty cell if (s + 1 < n_seqs) { next_empty_cell += 1; - for (uint32_t i = 0; i < cache.size; ++i) { - if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } - llama_kv_cell & cell = cache.cells[next_empty_cell]; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; if (cell.is_empty()) { break; } next_empty_cell += 1; } @@ -237,10 +554,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // gather and re-order for (uint32_t s = 0; s < n_seqs; ++s) { int32_t dst_id = s + min; - int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; if (dst_id != src_id) { - llama_kv_cell & dst_cell = cache.cells[dst_id]; - llama_kv_cell & src_cell = cache.cells[src_id]; + llama_kv_cell & dst_cell = cells[dst_id]; + llama_kv_cell & src_cell = cells[src_id]; std::swap(dst_cell.pos, src_cell.pos); std::swap(dst_cell.src, src_cell.src); @@ -248,10 +565,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // swap tails (assuming they NEVER overlap) for (const llama_seq_id seq_id : src_cell.seq_id) { - cache.cells[seq_id].tail = src_id; + cells[seq_id].tail = src_id; } for (const llama_seq_id seq_id : dst_cell.seq_id) { - cache.cells[seq_id].tail = dst_id; + cells[seq_id].tail = dst_id; } } } @@ -260,7 +577,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; ++s) { const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; int32_t cell_id = s + min; - llama_kv_cell & cell = cache.cells[cell_id]; + llama_kv_cell & cell = cells[cell_id]; if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { // What should happen when the pos backtracks or skips a value? @@ -273,41 +590,41 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { const llama_seq_id seq_id = ubatch.seq_id[s][j]; cell.seq_id.insert(seq_id); - cache.cells[seq_id].tail = cell_id; + cells[seq_id].tail = cell_id; } } // allow getting the range of used cells, from head to head + n - cache.head = min; - cache.n = max - min + 1; - cache.used = std::count_if(cache.cells.begin(), cache.cells.end(), + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), [](const llama_kv_cell& cell){ return !cell.is_empty(); }); // sanity check - return llama_kv_cache_slot_info(cache.n >= n_seqs); + return llama_kv_cache_slot_info(n >= n_seqs); } // otherwise, one cell per token. 
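    // (the non-recurrent path below is a plain ring-buffer search: starting at `head`, look
    //  for a contiguous run of n_tokens free cells, wrapping around and giving up once the
    //  whole cache has been tested)
    //
    // Minimal usage sketch of the member API introduced by this patch series (error handling
    //  is illustrative only):
    //
    //   const auto slot = kv_self.find_slot(ubatch);
    //   if (!slot) {
    //       // no contiguous space left -- defragment or fail the decode
    //   }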
- if (n_tokens > cache.size) { - LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size); + if (n_tokens > size) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); return llama_kv_cache_slot_info_failed; } uint32_t n_tested = 0; while (true) { - if (cache.head + n_tokens > cache.size) { - n_tested += cache.size - cache.head; - cache.head = 0; + if (head + n_tokens > size) { + n_tested += size - head; + head = 0; continue; } bool found = true; for (uint32_t i = 0; i < n_tokens; i++) { - if (cache.cells[cache.head + i].pos >= 0) { + if (cells[head + i].pos >= 0) { found = false; - cache.head += i + 1; - n_tested += i + 1; + head += i + 1; + n_tested += i + 1; break; } } @@ -316,7 +633,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( break; } - if (n_tested >= cache.size) { + if (n_tested >= size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return llama_kv_cache_slot_info_failed; } @@ -325,22 +642,27 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; s++) { for (uint32_t i = 0; i < n_seq_tokens; ++i) { uint32_t k = s*n_seq_tokens + i; - cache.cells[cache.head + k].pos = ubatch.pos[k]; + cells[head + k].pos = ubatch.pos[k]; for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { - cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]); + cells[head + k].seq_id.insert(ubatch.seq_id[s][j]); } } } - cache.used += n_tokens; + used += n_tokens; - return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens); + return llama_kv_cache_slot_info(head, head + n_tokens); +} + +uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) const { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; } -uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { - for (uint32_t i = cache.size; i > 0; --i) { - const llama_kv_cell & cell = cache.cells[i - 1]; +uint32_t llama_kv_cache::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const llama_kv_cell & cell = cells[i - 1]; if (cell.pos >= 0 && !cell.is_empty()) { return i; diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index b0bb1cfb14f12..4ee3418d80334 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -7,6 +7,9 @@ #include #include +struct llama_cparams; +struct llama_ubatch; + struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; @@ -28,7 +31,19 @@ struct llama_kv_cell { } }; +// a structure holds information about the slot found in llama_kv_cache_find_slot +struct llama_kv_cache_slot_info { + std::pair boundaries; // slot boundaries [begin, end) + bool found = false; // the slot was found + + explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} + llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} + + operator bool() const { return found; } +}; + // ring-buffer of cached KV data +// TODO: pimpl struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; @@ -57,370 +72,48 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; - int32_t n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); - } - - return result; - } + // TODO: become constructor + bool init( + const llama_model & model, + const llama_cparams & cparams, + ggml_type type_k, + ggml_type type_v, + uint32_t kv_size, + bool offload); - size_t total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } + int32_t n_tokens() const; - return size; - } + size_t total_size() const; // TODO: better data structures to reduce the cost of this operation - llama_pos max_pos() const { - llama_pos max_pos = -1; - for (const auto & cell : cells) { - max_pos = std::max(max_pos, cell.pos); - } - - return max_pos; - } - - void clear() { - for (int32_t i = 0; i < (int32_t) size; ++i) { - cells[i].pos = -1; - cells[i].seq_id.clear(); - cells[i].src = -1; - cells[i].tail = -1; - } - head = 0; - used = 0; + llama_pos max_pos() const; - for (auto & buf : bufs) { - ggml_backend_buffer_clear(buf.get(), 0); - } - } + void clear(); - bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - uint32_t new_head = size; + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); + void seq_keep(llama_seq_id seq_id); + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); - if (p0 < 0) { - p0 = 0; - } + llama_pos seq_pos_max(llama_seq_id seq_id); - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } + void defrag(); - // models like Mamba or RWKV can't have a state partially erased - if (recurrent) { - if (seq_id >= (int64_t) size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - const llama_kv_cell & cell = cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } 
- } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - } + // find an empty slot of size "n_tokens" in the cache + // updates the cache head + // returns a structure holding information about the slot found + // Note: On success, it's important that cache.head points + // to the first cell of the slot. + llama_kv_cache_slot_info find_slot(const llama_ubatch & batch); - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos >= p0 && cells[i].pos < p1) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != size && new_head < head) { - head = new_head; - } - - return true; - } - - void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - if (recurrent) { - if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { - llama_kv_cell & tail_src = cells[seq_id_src]; - llama_kv_cell & tail_dst = cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - - // otherwise, this is the KV of a Transformer-like model - head = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { - cells[i].seq_id.insert(seq_id_dst); - } - } - } - - void seq_keep(llama_seq_id seq_id) { - uint32_t new_head = size; - - for (uint32_t i = 0; i < size; ++i) { - if (recurrent && (llama_seq_id) i != seq_id) { - cells[i].tail = -1; - } - - if (!cells[i].has_seq_id(seq_id)) { - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - cells[i].seq_id.clear(); - - if (new_head == size){ - new_head = i; - } - } else { - cells[i].seq_id.clear(); - cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. 
- if (new_head != size && new_head < head) { - head = new_head; - } - } - - void seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } + // TODO: maybe not needed + uint32_t get_padding(const llama_cparams & cparams) const; - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the - if (p0 == p1) { - return; - } - - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - cells[i].pos += delta; - cells[i].delta += delta; - - if (cells[i].pos < 0) { - if (!cells[i].is_empty()) { - used--; - } - cells[i].pos = -1; - cells[i].seq_id.clear(); - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - head = new_head != size ? new_head : 0; - } - - void seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the cache. - if (p0 == p1) { - return; - } - - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - - { - llama_pos p_old = cells[i].pos; - cells[i].pos /= d; - cells[i].delta += cells[i].pos - p_old; - } - } - } - } - - llama_pos seq_pos_max(llama_seq_id seq_id) { - llama_pos result = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::max(result, cells[i].pos); - } - } - - return result; - } - - void defrag() { - if (!recurrent) { - do_defrag = true; - } - } -}; - -// a structure holds information about the slot found in llama_kv_cache_find_slot -struct llama_kv_cache_slot_info { - std::pair boundaries; // slot boundaries [begin, end) - bool found = false; // the slot was found - - explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} - llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} - - operator bool() const { return found; } + // find how many cells are currently in use + uint32_t cell_max() const; }; -// TODO: maybe not needed -uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams); - -bool llama_kv_cache_init( - struct llama_kv_cache & cache, - const llama_model & model, - const llama_cparams & cparams, - ggml_type type_k, - ggml_type type_v, - uint32_t kv_size, - bool offload); - -// find an empty slot of size "n_tokens" in the cache -// updates the cache head -// returns 
a structure holding information about the slot found -// Note: On success, it's important that cache.head points -// to the first cell of the slot. -struct llama_kv_cache_slot_info llama_kv_cache_find_slot( - struct llama_kv_cache & cache, - const struct llama_ubatch & batch); - -// find how many cells are currently in use -uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache); - -// -// kv cache view -// - -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); - -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); - // // kv cache restore // @@ -472,3 +165,10 @@ struct llama_kv_slot_restorer { } }; +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); + +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); diff --git a/src/llama.cpp b/src/llama.cpp index 87dd512b2546a..d8427af9d1b6d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8572,18 +8572,18 @@ static int llama_decode_impl( kv_self.head = 0; } - const auto slot = llama_kv_cache_find_slot(kv_self, ubatch); - if (!slot) { + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { return 1; } - kv_slot_restorer.save(slot); + kv_slot_restorer.save(slot_info); if (!kv_self.recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = llama_kv_cache_get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad))); + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); //kv_self.n = llama_kv_cache_cell_max(kv_self); } } @@ -8969,7 +8969,7 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { const uint32_t n_layer = hparams.n_layer; - const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_kv = kv_self.cell_max(); const uint32_t n_used = kv_self.used; assert(n_used <= n_kv); @@ -9550,7 +9550,7 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->kv_self.get_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; @@ -9692,7 +9692,7 @@ struct llama_context * llama_init_from_model( llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!ctx->kv_self.init(ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; From fef90cb3d7a823bd00a7899b52ffc70a4f824d44 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 15:58:20 +0200 Subject: [PATCH 04/84] kv_cache : fix ggml-ci --- src/llama-kv-cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 8b2f6287b8ae7..fe59867684a85 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -35,8 +35,8 @@ bool llama_kv_cache::init( size = kv_size; used = 0; - type_k = type_k; - type_v = type_v; + this->type_k = type_k; + this->type_v = type_v; cells.clear(); cells.resize(kv_size); From 73a14eccc9f200d6012963af9448042dfeac54fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 11:56:53 +0200 Subject: [PATCH 05/84] kv_cache : minor --- src/llama-kv-cache.cpp | 38 +++++++++++++++++++++++++++++++------- src/llama-kv-cache.h | 18 +++++++++++------- src/llama.cpp | 18 +++++------------- 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index fe59867684a85..9f3b4e5144415 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -73,17 +73,22 @@ bool llama_kv_cache::init( const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); - LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa); + const char * dev_name = "CPU"; ggml_backend_buffer_type_t buft; if (offload) { auto * dev = model.dev_layer(i); buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); } else { buft = ggml_backend_cpu_buffer_type(); } - ggml_context * ctx = ctx_for_buft(buft); + LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, + i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); return false; @@ -134,14 +139,13 @@ size_t llama_kv_cache::total_size() const { return size; } -// TODO: better data structures to reduce the cost of this operation -llama_pos llama_kv_cache::max_pos() const { - llama_pos max_pos = -1; +llama_pos llama_kv_cache::pos_max() const { + llama_pos pos_max = -1; for (const auto & cell : cells) { - max_pos = std::max(max_pos, cell.pos); + pos_max = std::max(pos_max, cell.pos); } - return max_pos; + return pos_max; } void llama_kv_cache::clear() { @@ -672,6 +676,26 @@ uint32_t llama_kv_cache::cell_max() const { return 0; } +size_t llama_kv_cache::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + void llama_kv_cache_clear(llama_kv_cache * 
kv) { kv->clear(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 4ee3418d80334..97285481e3588 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -61,17 +61,11 @@ struct llama_kv_cache { // computed before each graph build uint32_t n = 0; - ggml_type type_k = GGML_TYPE_F16; - ggml_type type_v = GGML_TYPE_F16; - std::vector cells; std::vector k_l; // per layer std::vector v_l; - std::vector ctxs; - std::vector bufs; - // TODO: become constructor bool init( const llama_model & model, @@ -86,7 +80,7 @@ struct llama_kv_cache { size_t total_size() const; // TODO: better data structures to reduce the cost of this operation - llama_pos max_pos() const; + llama_pos pos_max() const; void clear(); @@ -112,6 +106,16 @@ struct llama_kv_cache { // find how many cells are currently in use uint32_t cell_max() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + +private: + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + + std::vector ctxs; + std::vector bufs; }; // diff --git a/src/llama.cpp b/src/llama.cpp index d8427af9d1b6d..0227ba6b36a93 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1973,7 +1973,7 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -8456,7 +8456,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens_all = batch.n_tokens; @@ -8792,7 +8792,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens = batch.n_tokens; @@ -9699,16 +9699,8 @@ struct llama_context * llama_init_from_model( } { - size_t memory_size_k = 0; - size_t memory_size_v = 0; - - for (auto & k : ctx->kv_self.k_l) { - memory_size_k += ggml_nbytes(k); - } - - for (auto & v : ctx->kv_self.v_l) { - memory_size_v += ggml_nbytes(v); - } + const size_t memory_size_k = ctx->kv_self.size_k_bytes(); + const size_t memory_size_v = ctx->kv_self.size_v_bytes(); LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), From 4cd1b6fa4cc4e8da927caac5c61b9fcd096a1ace Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 12:33:13 +0200 Subject: [PATCH 06/84] context : prepare kv_cache_read/write to be moved to kv_cache ggml-ci --- src/llama-context.cpp | 153 +++++++++++++++++++++--------------------- src/llama-kv-cache.h | 1 + 2 files changed, 76 insertions(+), 78 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0654feccb8951..8fc6de2f271f9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -928,11 +928,8 @@ struct llama_data_write { } } - void write_kv_cache_data(const struct llama_context * ctx, const std::vector> & cell_ranges) { - const struct llama_kv_cache & kv_self = ctx->kv_self; - const struct llama_hparams & hparams = ctx->model.hparams; - - const uint32_t v_trans = kv_self.v_trans ? 1 : 0; + void write_kv_cache_data(const llama_kv_cache & kv, const llama_hparams & hparams, const std::vector> & cell_ranges) { + const uint32_t v_trans = kv.v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; write(&v_trans, sizeof(v_trans)); @@ -946,52 +943,52 @@ struct llama_data_write { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); // Write key type - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + const int32_t k_type_i = (int32_t)kv.k_l[il]->type; write(&k_type_i, sizeof(k_type_i)); // Write row size of key - const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const uint64_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); write(&k_size_row, sizeof(k_size_row)); // Read each range of cells of k_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * k_size_row; - write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); + write_tensor_data(kv.k_l[il], range.first * k_size_row, buf_size); } } - if (!kv_self.v_trans) { + if (!kv.v_trans) { for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; write(&v_type_i, sizeof(v_type_i)); // Write row size of value - const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + const uint64_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); write(&v_size_row, sizeof(v_size_row)); // Read each range of cells of v_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * v_size_row; - write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); + 
write_tensor_data(kv.v_l[il], range.first * v_size_row, buf_size); } } } else { // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv_self.size; + const uint32_t kv_size = kv.size; for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; write(&v_type_i, sizeof(v_type_i)); // Write element size - const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const uint32_t v_size_el = ggml_type_size(kv.v_l[il]->type); write(&v_size_el, sizeof(v_size_el)); // Write GQA embedding size @@ -1004,37 +1001,36 @@ struct llama_data_write { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; const size_t buf_size = range_size * v_size_el; - write_tensor_data(kv_self.v_l[il], src_offset, buf_size); + write_tensor_data(kv.v_l[il], src_offset, buf_size); } } } } } - void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) { - const struct llama_kv_cache & kv_self = ctx->kv_self; + void write_kv_cache(const llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; // Count the number of cells with the specified seq_id // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = kv_self.size; - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; + uint32_t cell_range_begin = kv.size; + for (uint32_t i = 0; i < kv.size; ++i) { + const auto & cell = kv.cells[i]; if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { ++cell_count; - if (cell_range_begin == kv_self.size) { + if (cell_range_begin == kv.size) { cell_range_begin = i; } } else { - if (cell_range_begin != kv_self.size) { + if (cell_range_begin != kv.size) { cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv_self.size; + cell_range_begin = kv.size; } } } - if (cell_range_begin != kv_self.size) { - cell_ranges.emplace_back(cell_range_begin, kv_self.size); + if (cell_range_begin != kv.size) { + cell_ranges.emplace_back(cell_range_begin, kv.size); } // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count @@ -1046,8 +1042,8 @@ struct llama_data_write { write(&cell_count, sizeof(cell_count)); - write_kv_cache_meta(kv_self, cell_ranges, seq_id); - write_kv_cache_data(ctx, cell_ranges); + write_kv_cache_meta(kv, cell_ranges, seq_id); + write_kv_cache_data(kv, hparams, cell_ranges); } }; @@ -1140,15 +1136,15 @@ struct llama_data_read { } } - bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { - struct llama_kv_cache & kv_self = ctx->kv_self; - + bool read_kv_cache_meta(llama_kv_cache & kv, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { if (dest_seq_id != -1) { // single sequence - kv_self.seq_rm(dest_seq_id, -1, -1); + kv.seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false); batch.n_tokens = cell_count; batch.n_seq_tokens = cell_count; batch.n_seqs = 1; @@ -1157,7 +1153,7 @@ struct llama_data_read { llama_pos pos; uint32_t n_seq_id; - 
read_to(&pos, sizeof(pos)); + read_to(&pos, sizeof(pos)); read_to(&n_seq_id, sizeof(n_seq_id)); if (n_seq_id != 0) { @@ -1169,30 +1165,30 @@ struct llama_data_read { } batch.n_seq_id[0] = 1; batch.seq_id[0] = &dest_seq_id; - if (!kv_self.find_slot(batch)) { + if (!kv.find_slot(batch)) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } - // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells - GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); - GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv.head + cell_count <= kv.size); + GGML_ASSERT(kv.cells[kv.head].pos == batch.pos[0]); + GGML_ASSERT(kv.cells[kv.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(kv.cells[kv.head].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv.cells[kv.head + cell_count - 1].has_seq_id(dest_seq_id)); } else { // whole KV cache restore - if (cell_count > kv_self.size) { + if (cell_count > kv.size) { LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); return false; } - kv_self.clear(); + kv.clear(); for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = kv_self.cells[i]; + llama_kv_cell & cell = kv.cells[i]; llama_pos pos; uint32_t n_seq_id; @@ -1206,15 +1202,18 @@ struct llama_data_read { llama_seq_id seq_id; read_to(&seq_id, sizeof(seq_id)); - if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + // TODO: llama_kv_cache should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); return false; } cell.seq_id.insert(seq_id); - if (kv_self.recurrent) { - int32_t & tail = kv_self.cells[seq_id].tail; + if (kv.recurrent) { + int32_t & tail = kv.cells[seq_id].tail; if (tail != -1) { LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); return false; @@ -1224,24 +1223,22 @@ struct llama_data_read { } } - kv_self.head = 0; - kv_self.used = cell_count; + kv.head = 0; + kv.used = cell_count; } - if (kv_self.recurrent) { + if (kv.recurrent) { for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = kv_self.head + i; + uint32_t cell_id = kv.head + i; // make sure the recurrent states will keep their restored state - kv_self.cells[cell_id].src = cell_id; + kv.cells[cell_id].src = cell_id; } } return true; } - bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) { - const struct llama_hparams & hparams = ctx->model.hparams; - struct llama_kv_cache & kv_self = ctx->kv_self; + bool read_kv_cache_data(llama_kv_cache & kv, const llama_hparams & hparams, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; read_to(&v_trans, sizeof(v_trans)); @@ 
-1251,11 +1248,11 @@ struct llama_data_read { LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); return false; } - if (cell_count > kv_self.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size); + if (cell_count > kv.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv.size); return false; } - if (kv_self.v_trans != (bool) v_trans) { + if (kv.v_trans != (bool) v_trans) { LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); return false; } @@ -1267,7 +1264,7 @@ struct llama_data_read { // Read type of key int32_t k_type_i_ref; read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + const int32_t k_type_i = (int32_t)kv.k_l[il]->type; if (k_type_i != k_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); return false; @@ -1276,7 +1273,7 @@ struct llama_data_read { // Read row size of key uint64_t k_size_row_ref; read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); if (k_size_row != k_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); return false; @@ -1284,18 +1281,18 @@ struct llama_data_read { if (cell_count) { // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row); + ggml_backend_tensor_set(kv.k_l[il], read(cell_count * k_size_row), kv.head * k_size_row, cell_count * k_size_row); } } - if (!kv_self.v_trans) { + if (!kv.v_trans) { for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Read type of value int32_t v_type_i_ref; read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; if (v_type_i != v_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1304,7 +1301,7 @@ struct llama_data_read { // Read row size of value uint64_t v_size_row_ref; read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + const size_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); if (v_size_row != v_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); return false; @@ -1312,7 +1309,7 @@ struct llama_data_read { if (cell_count) { // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row); + ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_row), kv.head * v_size_row, cell_count * v_size_row); } } } else { @@ -1323,7 +1320,7 @@ struct llama_data_read { // Read type of value int32_t v_type_i_ref; read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; if (v_type_i != v_type_i_ref) { 
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1332,7 +1329,7 @@ struct llama_data_read { // Read element size of value uint32_t v_size_el_ref; read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size_el = ggml_type_size(kv.v_l[il]->type); if (v_size_el != v_size_el_ref) { LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); return false; @@ -1349,8 +1346,8 @@ struct llama_data_read { if (cell_count) { // For each row in the transposed matrix, read the values for the whole cell range for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el; - ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + const size_t dst_offset = (kv.head + j * kv.size) * v_size_el; + ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); } } } @@ -1358,17 +1355,17 @@ struct llama_data_read { return true; } - void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) { + void read_kv_cache(llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { uint32_t cell_count; read_to(&cell_count, sizeof(cell_count)); - bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count); + bool res = read_kv_cache_meta(kv, cell_count, seq_id) && read_kv_cache_data(kv, hparams, cell_count); if (!res) { if (seq_id == -1) { - ctx->kv_self.clear(); + kv.clear(); } else { - ctx->kv_self.seq_rm(seq_id, -1, -1); + kv.seq_rm(seq_id, -1, -1); } throw std::runtime_error("failed to restore kv cache"); } @@ -1521,7 +1518,7 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_logits(ctx); data_ctx.write_embeddings(ctx); - data_ctx.write_kv_cache(ctx); + data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams); return data_ctx.get_size_written(); } @@ -1558,7 +1555,7 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_logits(ctx); data_ctx.read_embeddings(ctx); - data_ctx.read_kv_cache(ctx); + data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams); return data_ctx.get_size_read(); } @@ -1654,7 +1651,7 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - data_ctx.write_kv_cache(ctx, seq_id); + data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams, seq_id); return data_ctx.get_size_written(); } @@ -1677,7 +1674,7 @@ size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - data_ctx.read_kv_cache(ctx, dest_seq_id); + data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams, dest_seq_id); return data_ctx.get_size_read(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 97285481e3588..7fc2fabf5163d 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -44,6 +44,7 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl +// TODO: add notion of max sequences struct llama_kv_cache { bool 
has_shift = false; bool do_defrag = false; From fd05ab87aad1221535da86d5cd810ee5856ebb49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 13:13:35 +0200 Subject: [PATCH 07/84] kv_cache : move state read/write to llama_kv_cache ggml-ci --- src/llama-context.cpp | 424 +++++------------------------------------ src/llama-kv-cache.cpp | 378 ++++++++++++++++++++++++++++++++++++ src/llama-kv-cache.h | 20 ++ 3 files changed, 446 insertions(+), 376 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8fc6de2f271f9..0e146652c5996 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -908,143 +908,6 @@ struct llama_data_write { write(ctx->embd, embeddings_size * sizeof(float)); } } - - void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) { - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = kv_self.cells[i]; - const llama_pos pos = cell.pos; - const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; - - write(&pos, sizeof(pos)); - write(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - write(&seq_id, sizeof(seq_id)); - } - } - } - } - } - - void write_kv_cache_data(const llama_kv_cache & kv, const llama_hparams & hparams, const std::vector> & cell_ranges) { - const uint32_t v_trans = kv.v_trans ? 1 : 0; - const uint32_t n_layer = hparams.n_layer; - - write(&v_trans, sizeof(v_trans)); - write(&n_layer, sizeof(n_layer)); - - std::vector tmp_buf; - - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Write key type - const int32_t k_type_i = (int32_t)kv.k_l[il]->type; - write(&k_type_i, sizeof(k_type_i)); - - // Write row size of key - const uint64_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); - write(&k_size_row, sizeof(k_size_row)); - - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * k_size_row; - write_tensor_data(kv.k_l[il], range.first * k_size_row, buf_size); - } - } - - if (!kv.v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - write(&v_type_i, sizeof(v_type_i)); - - // Write row size of value - const uint64_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); - write(&v_size_row, sizeof(v_size_row)); - - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * v_size_row; - write_tensor_data(kv.v_l[il], range.first * v_size_row, buf_size); - } - } - } else { - // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv.size; - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - write(&v_type_i, sizeof(v_type_i)); - - // Write element size - const uint32_t v_size_el = 
ggml_type_size(kv.v_l[il]->type); - write(&v_size_el, sizeof(v_size_el)); - - // Write GQA embedding size - write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); - - // For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - const size_t buf_size = range_size * v_size_el; - write_tensor_data(kv.v_l[il], src_offset, buf_size); - } - } - } - } - } - - void write_kv_cache(const llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; - - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = kv.size; - for (uint32_t i = 0; i < kv.size; ++i) { - const auto & cell = kv.cells[i]; - if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { - ++cell_count; - if (cell_range_begin == kv.size) { - cell_range_begin = i; - } - } else { - if (cell_range_begin != kv.size) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv.size; - } - } - } - if (cell_range_begin != kv.size) { - cell_ranges.emplace_back(cell_range_begin, kv.size); - } - - // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - GGML_ASSERT(cell_count == cell_count_check); - - write(&cell_count, sizeof(cell_count)); - - write_kv_cache_meta(kv, cell_ranges, seq_id); - write_kv_cache_data(kv, hparams, cell_ranges); - } }; struct llama_data_read { @@ -1135,241 +998,6 @@ struct llama_data_read { read_to(ctx->embd, embeddings_size * sizeof(float)); } } - - bool read_kv_cache_meta(llama_kv_cache & kv, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { - if (dest_seq_id != -1) { - // single sequence - - kv.seq_rm(dest_seq_id, -1, -1); - - llama_sbatch sbatch; - llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - - batch.n_tokens = cell_count; - batch.n_seq_tokens = cell_count; - batch.n_seqs = 1; - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - uint32_t n_seq_id; - - read_to(&pos, sizeof(pos)); - read_to(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id != 0) { - LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); - return false; - } - - batch.pos[i] = pos; - } - batch.n_seq_id[0] = 1; - batch.seq_id[0] = &dest_seq_id; - if (!kv.find_slot(batch)) { - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return false; - } - - // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(kv.head + cell_count <= kv.size); - GGML_ASSERT(kv.cells[kv.head].pos == batch.pos[0]); - GGML_ASSERT(kv.cells[kv.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv.cells[kv.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv.cells[kv.head + cell_count - 1].has_seq_id(dest_seq_id)); - } else { - // whole KV cache restore - - if (cell_count > kv.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); - return false; - } - - 
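For reference, the reader code removed in this hunk (and reinstated as llama_kv_cache member functions in the next commit) ultimately backs the public llama_state_seq_* calls. A minimal sketch of that round trip, in the spirit of the save-load-state example; ctx_src, ctx_dst and the sequence ids are illustrative, and error handling is reduced to checking for a zero return:

#include <cstdint>
#include <vector>

#include "llama.h"

// Sketch only: copy the KV state of sequence 0 in ctx_src into sequence 1 of ctx_dst.
static bool copy_seq_state(llama_context * ctx_src, llama_context * ctx_dst) {
    std::vector<uint8_t> buf(llama_state_seq_get_size(ctx_src, 0));

    const size_t n_saved = llama_state_seq_get_data(ctx_src, buf.data(), buf.size(), 0);
    if (n_saved == 0) {
        return false; // nothing was serialized
    }

    // a return of 0 means the restore failed and the destination sequence was cleared
    return llama_state_seq_set_data(ctx_dst, buf.data(), n_saved, 1) != 0;
}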
kv.clear(); - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = kv.cells[i]; - - llama_pos pos; - uint32_t n_seq_id; - - read_to(&pos, sizeof(pos)); - read_to(&n_seq_id, sizeof(n_seq_id)); - - cell.pos = pos; - - for (uint32_t j = 0; j < n_seq_id; ++j) { - llama_seq_id seq_id; - read_to(&seq_id, sizeof(seq_id)); - - // TODO: llama_kv_cache should have a notion of max sequences - //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - if (seq_id < 0) { - //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); - return false; - } - - cell.seq_id.insert(seq_id); - - if (kv.recurrent) { - int32_t & tail = kv.cells[seq_id].tail; - if (tail != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); - return false; - } - tail = i; - } - } - } - - kv.head = 0; - kv.used = cell_count; - } - - if (kv.recurrent) { - for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = kv.head + i; - // make sure the recurrent states will keep their restored state - kv.cells[cell_id].src = cell_id; - } - } - - return true; - } - - bool read_kv_cache_data(llama_kv_cache & kv, const llama_hparams & hparams, uint32_t cell_count) { - uint32_t v_trans; - uint32_t n_layer; - read_to(&v_trans, sizeof(v_trans)); - read_to(&n_layer, sizeof(n_layer)); - - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); - return false; - } - if (cell_count > kv.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv.size); - return false; - } - if (kv.v_trans != (bool) v_trans) { - LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); - return false; - } - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t)kv.k_l[il]->type; - if (k_type_i != k_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return false; - } - - // Read row size of key - uint64_t k_size_row_ref; - read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv.k_l[il], read(cell_count * k_size_row), kv.head * k_size_row, cell_count * k_size_row); - } - } - - if (!kv.v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read row size of value - uint64_t v_size_row_ref; 
- read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_row), kv.head * v_size_row, cell_count * v_size_row); - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read element size of value - uint32_t v_size_el_ref; - read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(kv.v_l[il]->type); - if (v_size_el != v_size_el_ref) { - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); - return false; - } - - // Read GQA embedding size - uint32_t n_embd_v_gqa_ref; - read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); - if (n_embd_v_gqa != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); - return false; - } - - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv.head + j * kv.size) * v_size_el; - ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); - } - } - } - } - return true; - } - - void read_kv_cache(llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { - uint32_t cell_count; - read_to(&cell_count, sizeof(cell_count)); - - bool res = read_kv_cache_meta(kv, cell_count, seq_id) && read_kv_cache_data(kv, hparams, cell_count); - - if (!res) { - if (seq_id == -1) { - kv.clear(); - } else { - kv.seq_rm(seq_id, -1, -1); - } - throw std::runtime_error("failed to restore kv cache"); - } - } }; struct llama_data_write_dummy : llama_data_write { @@ -1518,7 +1146,18 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_logits(ctx); data_ctx.write_embeddings(ctx); - data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams); + llama_kv_cache::io io = { + /* .write =*/ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read =*/ nullptr, + /* .read_to =*/ nullptr, + }; + + ctx->kv_self.state_write(io, ctx->model.hparams); return data_ctx.get_size_written(); } @@ -1555,7 +1194,18 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_logits(ctx); data_ctx.read_embeddings(ctx); - data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams); + llama_kv_cache::io io = { + /* .write =*/ nullptr, + /* .write_tensor_data =*/ nullptr, + /* .read =*/ [&](size_t size) { + return 
data_ctx.read(size); + }, + /* .read_to =*/ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; + + ctx->kv_self.state_read(io, ctx->model.hparams); return data_ctx.get_size_read(); } @@ -1651,7 +1301,18 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams, seq_id); + llama_kv_cache::io io = { + /* .write =*/ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read =*/ nullptr, + /* .read_to =*/ nullptr, + }; + + ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); return data_ctx.get_size_written(); } @@ -1674,7 +1335,18 @@ size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams, dest_seq_id); + llama_kv_cache::io io = { + /* .write =*/ nullptr, + /* .write_tensor_data =*/ nullptr, + /* .read =*/ [&](size_t size) { + return data_ctx.read(size); + }, + /* .read_to =*/ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; + + ctx->kv_self.state_read(io, ctx->model.hparams, dest_seq_id); return data_ctx.get_size_read(); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 9f3b4e5144415..6886d24f0d98f 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -8,6 +8,7 @@ #include #include #include +#include static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; @@ -696,6 +697,383 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } +void llama_kv_cache::state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges, hparams); +} + +void llama_kv_cache::state_read(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, hparams, cell_count); + + if 
(!res) { + if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache::state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache::state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { + const uint32_t v_trans = this->v_trans ? 1 : 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor_data(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor_data(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + 
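The write path above only ever touches the io callbacks, so any byte sink can back it. A minimal sketch of wiring llama_kv_cache::io to an in-memory buffer, with the callback signatures taken from the lambdas in llama_state_get_data_internal; make_buffer_writer is an illustrative helper, and ggml_backend_tensor_get is just one possible way to realize write_tensor_data:

#include <cstdint>
#include <vector>

#include "llama-kv-cache.h" // llama_kv_cache::io (introduced in this patch)
#include "ggml-backend.h"   // ggml_backend_tensor_get

// Sketch only: an io writer that appends all serialized bytes to a std::vector.
// The caller must keep `out` alive for as long as the returned callbacks are used.
static llama_kv_cache::io make_buffer_writer(std::vector<uint8_t> & out) {
    return {
        /* .write             =*/ [&out](const void * src, size_t size) {
            const uint8_t * p = static_cast<const uint8_t *>(src);
            out.insert(out.end(), p, p + size);
        },
        /* .write_tensor_data =*/ [&out](const struct ggml_tensor * tensor, size_t offset, size_t size) {
            std::vector<uint8_t> tmp(size);
            ggml_backend_tensor_get(tensor, tmp.data(), offset, size); // copy tensor bytes to host
            out.insert(out.end(), tmp.begin(), tmp.end());
        },
        /* .read              =*/ nullptr,
        /* .read_to           =*/ nullptr,
    };
}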
io.write_tensor_data(v_l[il], src_offset, buf_size); + } + } + } + } +} + +bool llama_kv_cache::state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + if (recurrent) { + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + } + + head = 0; + used = cell_count; + } + + if (recurrent) { + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + } + + return true; +} + +bool llama_kv_cache::state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row 
is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, 
cell_count * v_size_el); + } + } + } + } + + return true; +} + +///////////// + void llama_kv_cache_clear(llama_kv_cache * kv) { kv->clear(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 7fc2fabf5163d..0384a2b7ce7ab 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -6,8 +6,10 @@ #include #include +#include struct llama_cparams; +struct llama_hparams; struct llama_ubatch; struct llama_kv_cell { @@ -45,6 +47,7 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences +// TODO: add llama_hparams & struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; @@ -111,12 +114,29 @@ struct llama_kv_cache { size_t size_k_bytes() const; size_t size_v_bytes() const; + struct io { + std::function write; + std::function write_tensor_data; + + std::function read; + std::function read_to; + }; + + void state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; + void state_read (const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + private: ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; std::vector ctxs; std::vector bufs; + + void state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + + bool state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count); }; // From 17b363afd3575f8f9d025a35d2abb75f528a64c2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 16:47:34 +0200 Subject: [PATCH 08/84] llama : update llama_kv_self API ggml-ci --- common/common.cpp | 6 +- common/speculative.cpp | 10 +- examples/batched-bench/batched-bench.cpp | 6 +- examples/batched.swift/Sources/main.swift | 2 +- .../cvector-generator/cvector-generator.cpp | 3 +- examples/embedding/embedding.cpp | 5 +- examples/gritlm/gritlm.cpp | 8 +- examples/imatrix/imatrix.cpp | 4 +- examples/infill/infill.cpp | 6 +- examples/llama-bench/llama-bench.cpp | 6 +- .../llama/src/main/cpp/llama-android.cpp | 8 +- .../llama.cpp.swift/LibLlama.swift | 8 +- examples/lookahead/lookahead.cpp | 13 +- examples/lookup/lookup.cpp | 3 +- examples/main/main.cpp | 14 +- examples/parallel/parallel.cpp | 11 +- examples/passkey/passkey.cpp | 30 ++-- examples/perplexity/perplexity.cpp | 24 +-- examples/retrieval/retrieval.cpp | 4 +- examples/run/run.cpp | 7 +- examples/save-load-state/save-load-state.cpp | 4 +- examples/server/server.cpp | 25 ++- examples/simple-chat/simple-chat.cpp | 6 +- .../speculative-simple/speculative-simple.cpp | 4 +- examples/speculative/speculative.cpp | 29 ++-- include/llama.h | 105 ++++++++++--- src/llama-context.cpp | 34 ++-- src/llama-kv-cache.cpp | 20 +-- src/llama-kv-cache.h | 42 +++++ src/llama.cpp | 145 +++++++++++++++++- 30 files changed, 387 insertions(+), 205 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 29de45189e2d3..098feebee9e65 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -952,9 +952,7 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } - llama_kv_cache * kv = llama_get_kv_cache(lctx); - - if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) { + if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) { LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV 
cache shifting\n", __func__); params.ctx_shift = false; } @@ -1059,7 +1057,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_decoder(model)) { llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(kv); + llama_kv_self_clear(lctx); llama_synchronize(lctx); llama_perf_context_reset(lctx); } diff --git a/common/speculative.cpp b/common/speculative.cpp index 6ac0585178ebd..a660f198ae865 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -171,10 +171,8 @@ llama_tokens common_speculative_gen_draft( llama_tokens result; result.reserve(params.n_draft); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - if (reuse_n == 0) { - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); prompt.clear(); } else { @@ -193,14 +191,14 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_kv_cache_seq_rm (kv, 0, 0, reuse_i); - llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i); + llama_kv_self_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i); prompt.erase(prompt.begin(), prompt.begin() + reuse_i); } if (reuse_n < (int) prompt.size()) { - llama_kv_cache_seq_rm (kv, 0, reuse_n, -1); + llama_kv_self_seq_rm (ctx, 0, reuse_n, -1); prompt.erase(prompt.begin() + reuse_n, prompt.end()); } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index fcbad37bb3f2f..430e8be512653 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -57,8 +57,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const int32_t n_kv_max = llama_n_ctx(ctx); llama_batch batch = llama_batch_init(n_kv_max, 0, 1); @@ -134,7 +132,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -143,7 +141,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 371917b2ee863..a6494ebdfe176 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -111,7 +111,7 @@ if llama_decode(context, batch) != 0 { } for i in 1 ..< n_parallel { - llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index adb4a60ada41f..3733e32d7007e 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,8 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index fda0949f1c4cf..c4fb1c6d1d78f 
100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -34,11 +34,10 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const llama_model * model = llama_get_model(ctx); - llama_kv_cache * kv = llama_get_kv_cache(ctx); + const struct llama_model * model = llama_get_model(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 16437453edb89..f7db7861c1ad5 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -13,8 +13,6 @@ static std::vector> encode(llama_context * ctx, const std::ve const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { @@ -47,7 +45,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -102,11 +100,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 5efe4f019f562..e335ecc74b8fe 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -431,8 +431,6 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); @@ -499,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index de8e7769552bb..4e2f7b7270003 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -139,8 +139,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_model_n_ctx_train(model); @@ -334,8 +332,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(kv, 0, params.n_keep + 1 + 
n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8843c0048d6cc..fc58135fe5fa8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1575,11 +1575,9 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - test t(inst, lmodel, ctx); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // cool off before the test if (params.delay) { @@ -1619,7 +1617,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); uint64_t t_start = get_time_ns(); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2a73983a9832f..cf5e14907247e 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( } batch->logits[batch->n_tokens - 1] = true; - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp_start = ggml_time_us(); if (llama_decode(context, *batch) != 0) { @@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( LOGi("Benchmark text generation (tg)"); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { @@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_end = ggml_time_us(); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; @@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { - llama_kv_cache_clear(reinterpret_cast(context)); + llama_kv_self_clear(reinterpret_cast(context)); } diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 477c3e6f2e95b..82c26935bbaea 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -208,7 +208,7 @@ actor LlamaContext { } batch.logits[Int(batch.n_tokens) - 1] = 1 // true - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -221,7 +221,7 @@ actor LlamaContext { // bench text generation - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -240,7 +240,7 @@ actor LlamaContext { let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000; - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0 let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0 @@ -290,7 +290,7 @@ actor LlamaContext { func clear() { tokens_list.removeAll() temporary_invalid_cchars.removeAll() - llama_kv_cache_clear(context) + llama_kv_self_clear(context) } private func tokenize(text: String, add_bos: Bool) -> [llama_token] { diff --git a/examples/lookahead/lookahead.cpp 
b/examples/lookahead/lookahead.cpp index 1219c207464d2..b7f334007a39b 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -60,7 +60,6 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -96,7 +95,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(kv, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -438,17 +437,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(kv, -1, n_past, -1); + llama_kv_self_seq_rm(ctx, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(kv, seq_id_best); - llama_kv_cache_seq_cp (kv, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (kv, seq_id_best, -1, -1); + llama_kv_self_seq_keep(ctx, seq_id_best); + llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1); + llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(kv, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 8628f7318556c..4ae93b2a5ed15 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -35,7 +35,6 @@ int main(int argc, char ** argv){ llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -193,7 +192,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(kv, 0, n_past, -1); + llama_kv_self_seq_rm(ctx, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 9d79af79e2723..23437937cfb5e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,8 +157,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -330,7 +328,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(kv, -1, n_matching_session_tokens, -1); + llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -571,8 +569,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (kv, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + 
n_discard); + llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -595,9 +593,9 @@ int main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(kv, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(kv, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(kv, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 2ba0706dc5d24..3f9e1bcbbe540 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -134,7 +134,6 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -202,7 +201,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("\n"); @@ -234,9 +233,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(kv, i, -1, -1); + llama_kv_self_seq_rm(ctx, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -372,8 +371,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(kv, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(kv, 0, client.id + 1, -1, -1); + llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e2764313b2f01..46de2c2a207f9 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -86,8 +86,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -134,11 +132,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add(kv, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div(kv, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } common_batch_clear(batch); @@ -168,12 +166,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (kv); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; common_batch_clear(batch); @@ -199,12 +197,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (kv); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 6c9f716ede23c..31c436f13976b 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -299,8 +299,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -362,7 +360,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -452,8 +450,6 @@ static results_perplexity 
perplexity(llama_context * ctx, const common_params & const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -550,7 +546,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -745,8 +741,6 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -929,7 +923,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1090,8 +1084,6 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); @@ -1210,7 +1202,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1396,8 +1388,6 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); @@ -1584,7 +1574,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1681,8 +1671,6 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1776,7 +1764,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index a907ea07607dd..0efe20d4b3f5d 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -82,10 +82,8 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } 
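The call-site changes in these examples are mechanical: the llama_context pointer replaces the explicit llama_kv_cache pointer. A minimal sketch of the recurring context-shift step expressed with the renamed entry points, mirroring the main and infill changes above; shift_context is an illustrative name and sequence 0 is assumed:

// Sketch only: drop the oldest n_discard tokens after n_keep and slide the
// remaining cells back so the positions of sequence 0 stay contiguous.
static void shift_context(llama_context * ctx, int n_keep, int n_discard, int & n_past) {
    // remove positions [n_keep, n_keep + n_discard) from sequence 0
    llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
    // shift the surviving positions down by n_discard
    llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);

    n_past -= n_discard;
}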
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { - llama_kv_cache * kv = llama_get_kv_cache(ctx); - // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 8e2c174a955e8..2c38d1ef68321 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -756,8 +756,7 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const llama_kv_cache * kv = llama_get_kv_cache(llama_data.context.get()); - const bool is_first = llama_kv_cache_used_cells(kv) == 0; + const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -772,10 +771,8 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { - llama_kv_cache * kv = llama_get_kv_cache(ctx.get()); - const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_kv_cache_used_cells(kv); + const int n_ctx_used = llama_kv_self_used_cells(ctx.get()); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); printe("context size exceeded\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3839fbe8c84d5..77b1572a9dec5 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -156,8 +156,6 @@ int main(int argc, char ** argv) { // make new context llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); - llama_kv_cache * kv3 = llama_get_kv_cache(ctx3); - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); @@ -198,7 +196,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(kv3); + llama_kv_self_clear(ctx3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 076044d39679c..b665bde417094 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1693,7 +1693,6 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - llama_kv_cache * kv = nullptr; const llama_vocab * vocab = nullptr; @@ -1756,8 +1755,6 @@ struct server_context { return false; } - kv = llama_get_kv_cache(ctx); - vocab = llama_model_get_vocab(model); n_ctx = llama_n_ctx(ctx); @@ -2026,7 +2023,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); clean_kv_cache = false; } @@ -2568,8 +2565,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv); - 
res->kv_cache_used_cells = llama_kv_cache_used_cells(kv); + res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); + res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2685,7 +2682,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(kv, slot->id, -1, -1); + llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2753,8 +2750,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (kv, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -2941,8 +2938,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c); - llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift); + llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_self_seq_add(ctx, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2980,9 +2977,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) { + if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(kv, slot.id, -1, -1); + llama_kv_self_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3222,7 +3219,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1); + llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 130e326b55d4c..84f4159737260 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -88,8 +88,6 @@ int main(int argc, char ** argv) { return 1; } - const llama_kv_cache * kv = llama_get_kv_cache(ctx); - // initialize the sampler llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); @@ -100,7 +98,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_kv_cache_used_cells(kv) == 0; + const bool is_first = llama_kv_self_used_cells(ctx) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -115,7 +113,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_kv_cache_used_cells(kv); + int n_ctx_used = llama_kv_self_used_cells(ctx); if (n_ctx_used + batch.n_tokens 
> n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 24bdc806d5710..a5d2bc9d09de7 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -45,8 +45,6 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt.model.get(); ctx_tgt = llama_init_tgt.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx_tgt); - const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -219,7 +217,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_cache_seq_rm(kv, 0, n_past, -1); + llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b4e5259b5be46..bfddc67e034fb 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -90,9 +90,6 @@ int main(int argc, char ** argv) { model_dft = llama_init_dft.model.get(); ctx_dft = llama_init_dft.context.get(); - llama_kv_cache * kv_tgt = llama_get_kv_cache(ctx_tgt); - llama_kv_cache * kv_dft = llama_get_kv_cache(ctx_dft); - const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -423,14 +420,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_cache_seq_keep(kv_dft, s_keep); - llama_kv_cache_seq_cp (kv_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(kv_dft, 0); + llama_kv_self_seq_keep(ctx_dft, s_keep); + llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1); + llama_kv_self_seq_keep(ctx_dft, 0); - llama_kv_cache_seq_rm (kv_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(kv_tgt, s_keep); - llama_kv_cache_seq_cp (kv_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(kv_tgt, 0); + llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); + llama_kv_self_seq_keep(ctx_tgt, s_keep); + llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1); + llama_kv_self_seq_keep(ctx_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -447,8 +444,8 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(kv_dft, 0, n_past_dft, -1); - // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(kv_dft, batch_dft).c_str()); + llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -506,8 +503,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(kv_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(kv_dft, s, n_seq_cur, -1, -1); + llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1); + llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -588,9 +585,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(kv_tgt, 0); + llama_kv_self_seq_keep(ctx_tgt, 0); for (int s = 1; s 
< n_seq_dft; ++s) { - llama_kv_cache_seq_cp(kv_tgt, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); diff --git a/include/llama.h b/include/llama.h index 08b8658ad89ac..91300b1ae51a3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -469,7 +469,7 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); // TODO: remove const? - LLAMA_API struct llama_kv_cache * llama_get_kv_cache( struct llama_context * ctx); + LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @@ -641,28 +641,28 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv); + LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), - "use llama_kv_cache_n_tokens instead"); + "use llama_kv_self_n_tokens instead"); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv); + LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx), - "use llama_kv_cache_used_cells instead"); + "use llama_kv_self_used_cells instead"); // Clear the KV cache - both cell info is erased and KV data is zeroed - LLAMA_API void llama_kv_cache_clear( - struct llama_kv_cache * kv); + LLAMA_API void llama_kv_self_clear( + struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API bool llama_kv_cache_seq_rm( - struct llama_kv_cache * kv, + LLAMA_API bool llama_kv_self_seq_rm( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); @@ -671,26 +671,26 @@ extern "C" { // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_cp( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_cp( + struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // Removes all tokens that do not belong to the specified sequence - LLAMA_API void llama_kv_cache_seq_keep( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_keep( + struct llama_context * ctx, llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() + // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_add( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_add( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, @@ -699,32 +699,87 @@ extern "C" { // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() + // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_div( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_div( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); // Returns the largest position present in the KV cache for the specified sequence - LLAMA_API llama_pos llama_kv_cache_seq_pos_max( - struct llama_kv_cache * kv, + LLAMA_API llama_pos llama_kv_self_seq_pos_max( + struct llama_context * ctx, llama_seq_id seq_id); // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() - LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv); + // - explicitly with llama_kv_self_update() + LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx); // Check if the context supports KV cache shifting - LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv); + LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx); // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
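    //
    // Typical usage (illustrative sketch; `ctx`, `n_keep`, `n_discard` and `n_past`
    // are assumed to be provided by the caller): drop `n_discard` tokens of sequence 0
    // after the first `n_keep` tokens, shift the remaining positions back, and then
    // apply the resulting K-shift:
    //
    //   llama_kv_self_seq_rm (ctx, 0, n_keep,             n_keep + n_discard);
    //   llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);
    //
    //   // the shift is applied lazily on the next llama_decode(), or explicitly:
    //   llama_kv_self_update(ctx);
    //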
- LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv); + LLAMA_API void llama_kv_self_update(struct llama_context * ctx); + + DEPRECATED(LLAMA_API void llama_kv_cache_clear( + struct llama_context * ctx), + "use llama_kv_self_clear instead"); + + DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1), + "use llama_kv_self_seq_rm instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp( + struct llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1), + "use llama_kv_self_seq_cp instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep( + struct llama_context * ctx, + llama_seq_id seq_id), + "use llama_kv_self_seq_keep instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_add( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta), + "use llama_kv_self_seq_add instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_div( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d), + "use llama_kv_self_seq_div instead"); + + DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max( + struct llama_context * ctx, + llama_seq_id seq_id), + "use llama_kv_self_seq_pos_max instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx), + "use llama_kv_self_defrag instead"); + + DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx), + "use llama_kv_self_can_shift instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx), + "use llama_kv_self_update instead"); + // // State / sessions diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0e146652c5996..0004e214b9e27 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -606,7 +606,7 @@ const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->model; } -llama_kv_cache * llama_get_kv_cache(llama_context * ctx) { +llama_kv_cache * llama_get_kv_self(llama_context * ctx) { return &ctx->kv_self; } @@ -1147,14 +1147,14 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_embeddings(ctx); llama_kv_cache::io io = { - /* .write =*/ [&](const void * src, size_t size) { + /* .write = */ [&](const void * src, size_t size) { data_ctx.write(src, size); }, - /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { data_ctx.write_tensor_data(tensor, offset, size); }, - /* .read =*/ nullptr, - /* .read_to =*/ nullptr, + /* .read = */ nullptr, + /* .read_to = */ nullptr, }; ctx->kv_self.state_write(io, ctx->model.hparams); @@ -1195,12 +1195,12 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_embeddings(ctx); llama_kv_cache::io io = { - /* .write =*/ nullptr, - /* .write_tensor_data =*/ nullptr, - /* .read =*/ [&](size_t size) { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { return data_ctx.read(size); }, - /* .read_to =*/ [&](void * dst, size_t size) { + /* .read_to = */ [&](void * dst, size_t size) { data_ctx.read_to(dst, size); }, }; @@ -1302,14 +1302,14 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam llama_synchronize(ctx); llama_kv_cache::io io = 
{ - /* .write =*/ [&](const void * src, size_t size) { + /* .write = */ [&](const void * src, size_t size) { data_ctx.write(src, size); }, - /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { data_ctx.write_tensor_data(tensor, offset, size); }, - /* .read =*/ nullptr, - /* .read_to =*/ nullptr, + /* .read = */ nullptr, + /* .read_to = */ nullptr, }; ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); @@ -1336,12 +1336,12 @@ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llam llama_synchronize(ctx); llama_kv_cache::io io = { - /* .write =*/ nullptr, - /* .write_tensor_data =*/ nullptr, - /* .read =*/ [&](size_t size) { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { return data_ctx.read(size); }, - /* .read_to =*/ [&](void * dst, size_t size) { + /* .read_to = */ [&](void * dst, size_t size) { data_ctx.read_to(dst, size); }, }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 6886d24f0d98f..d2b81a0220d83 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1072,7 +1072,17 @@ bool llama_kv_cache::state_read_data(const io & io, const llama_hparams & hparam return true; } -///////////// +// +// interface implementation +// + +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + return kv->n_tokens(); +} + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + return kv->used; +} void llama_kv_cache_clear(llama_kv_cache * kv) { kv->clear(); @@ -1125,14 +1135,6 @@ void llama_kv_cache_defrag(llama_kv_cache * kv) { kv->defrag(); } -int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { - return kv->n_tokens(); -} - -int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { - return kv->used; -} - bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { return kv->can_shift; } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 0384a2b7ce7ab..2e021d4edf959 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -190,6 +190,48 @@ struct llama_kv_slot_restorer { } }; +// TODO: maybe become part of the public llama_kv_cache in the future +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv); + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv); + +void llama_kv_cache_clear(llama_kv_cache * kv); + +bool llama_kv_cache_seq_rm( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); + +void llama_kv_cache_seq_cp( + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); + +void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id); + +void llama_kv_cache_seq_add( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); + +void llama_kv_cache_seq_div( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); + +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id); + +void llama_kv_cache_defrag(llama_kv_cache * kv); + +bool llama_kv_cache_can_shift(const llama_kv_cache * kv); + // // kv cache view // diff --git a/src/llama.cpp b/src/llama.cpp index 0227ba6b36a93..b8f4043757d49 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8564,7 +8564,7 @@ static int llama_decode_impl( // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_update_kv_cache(&lctx, &lctx.kv_self); // TODO: 
lctx->update_kv_cache() + llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -9182,9 +9182,12 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_update_kv_cache_impl(llama_context & lctx, llama_kv_cache & kv) { +// TODO: move to llama_context +static void llama_kv_self_update_impl(llama_context & lctx) { bool need_reserve = false; + auto & kv = lctx.kv_self; + if (kv.has_shift) { if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); @@ -9856,17 +9859,151 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * // deprecated int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} + +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { return llama_kv_cache_n_tokens(&ctx->kv_self); } // deprecated int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} + +int32_t llama_kv_self_used_cells(const llama_context * ctx) { return llama_kv_cache_used_cells(&ctx->kv_self); } +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} + +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} + +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} + +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); +} + +void llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); +} + +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +} + +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); +} + +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} + 
+llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} + +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); +} + +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} + // TODO: move to llama-context -void llama_update_kv_cache(llama_context * ctx, llama_kv_cache * kv) { - llama_update_kv_cache_impl(*ctx, *kv); +void llama_kv_self_update(llama_context * ctx) { + llama_kv_self_update_impl(*ctx); } /// From a19f671fe078497f73ec1898951475e026ffdc20 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 10:54:21 +0200 Subject: [PATCH 09/84] context : minor ggml-ci --- src/llama-context.cpp | 36 +++++++++++------------------------- src/llama-context.h | 8 +++----- src/llama-kv-cache.cpp | 1 + src/llama-kv-cache.h | 6 +++--- src/llama.cpp | 33 +++++++++++++++++++-------------- 5 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0004e214b9e27..9eae6fe57ce1e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,30 +8,6 @@ #include #include -void llama_set_k_shift(struct llama_context & lctx) { - const int64_t kv_size = lctx.kv_self.size; - - assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < kv_size; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } -} - -void llama_set_s_copy(struct llama_context & lctx) { - const int64_t kv_size = lctx.kv_self.size; - - assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - - int32_t * data = (int32_t *) lctx.inp_s_copy->data; - - for (int i = 0; i < kv_size; ++i) { - data[i] = lctx.kv_self.cells[i].src; - } -} - // llama input static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { @@ -58,6 +34,16 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +void llama_context::set_k_shift(llama_kv_cache & kv) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv.size; ++i) { + data[i] = kv.cells[i].delta; + } +} + void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { // // set input data @@ -134,7 +120,6 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - float * data = nullptr; float * data_swa = nullptr; @@ -599,6 +584,7 @@ uint32_t llama_n_ubatch(const struct llama_context * ctx) { } uint32_t llama_n_seq_max(const struct llama_context * ctx) { + // TODO: add notion of n_seq_max to llama_kv_cache and use it here return ctx->kv_self.size; } diff --git a/src/llama-context.h b/src/llama-context.h index a9268b2920908..73baa711f394a 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -18,7 +18,7 @@ struct llama_context { llama_context(const llama_model & model) : model(model) , t_start_us(model.t_start_us) - , 
t_load_us(model.t_load_us) {} + , t_load_us (model.t_load_us) {} const struct llama_model & model; @@ -107,13 +107,11 @@ struct llama_context { struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + + void set_k_shift(llama_kv_cache & kv); }; // TODO: make these methods of llama_context -void llama_set_k_shift(struct llama_context & lctx); - -void llama_set_s_copy(struct llama_context & lctx); - void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); // Make sure enough space is available for outputs. diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index d2b81a0220d83..b79c2ff934a6e 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -6,6 +6,7 @@ #include "llama-model.h" #include +#include #include #include #include diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 2e021d4edf959..5ffee62818b18 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -88,11 +88,11 @@ struct llama_kv_cache { void clear(); - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); void seq_keep(llama_seq_id seq_id); - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); llama_pos seq_pos_max(llama_seq_id seq_id); diff --git a/src/llama.cpp b/src/llama.cpp index b8f4043757d49..3e1cd8260b329 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1142,18 +1142,18 @@ struct llm_build_context { ctx0 = ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; + lctx.inp_tokens = nullptr; + lctx.inp_embd = nullptr; + lctx.inp_pos = nullptr; + lctx.inp_out_ids = nullptr; + lctx.inp_KQ_mask = nullptr; + lctx.inp_KQ_mask_swa = nullptr; + lctx.inp_K_shift = nullptr; + lctx.inp_mean = nullptr; + lctx.inp_cls = nullptr; + lctx.inp_s_copy = nullptr; + lctx.inp_s_mask = nullptr; + lctx.inp_s_seq = nullptr; lctx.inp_pos_bucket = nullptr; lctx.inp_embd_enc = nullptr; lctx.inp_KQ_mask_cross = nullptr; @@ -1174,9 +1174,11 @@ struct llm_build_context { ggml_set_input(lctx.inp_K_shift); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head_k, n_head_kv, n_ctx, @@ -1189,6 +1191,7 @@ struct llm_build_context { // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); cb(tmp, "K_f32", il); + for (auto & backend : lctx.backends) { // Figure out which backend KV cache belongs to if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { @@ -1200,6 +1203,7 @@ struct 
llm_build_context { lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted_f32", il); + tmp = ggml_cpy(ctx0, tmp, k); } else { // we rotate only the first n_rot dimensions @@ -1208,6 +1212,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); } cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); } @@ -9201,7 +9206,7 @@ static void llama_kv_self_update_impl(llama_context & lctx) { ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - llama_set_k_shift(lctx); + lctx.set_k_shift(kv); llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); From ae274f9747cce6ba6b4910d05ddc3016cd0b4e21 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 13:35:56 +0200 Subject: [PATCH 10/84] llama : fix names [no ci] --- src/llama.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3e1cd8260b329..37816ddc28a38 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1373,9 +1373,9 @@ struct llm_build_context { inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; - } else { - inp = nullptr; } + + inp = nullptr; } GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); @@ -1431,7 +1431,7 @@ struct llm_build_context { return gf; } - struct ggml_tensor * llm_build_pos_bucket(bool causal) { + struct ggml_tensor * build_pos_bucket(bool causal) { if (causal) { lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); } else { @@ -1444,7 +1444,7 @@ struct llm_build_context { return lctx.inp_pos_bucket; } - struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); cb(pos_bucket_1d, "pos_bucket_1d", -1); @@ -1463,7 +1463,7 @@ struct llm_build_context { return pos_bias; } - struct ggml_tensor * llm_build_inp_embd_enc() { + struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); ggml_set_input(lctx.inp_embd_enc); @@ -1471,7 +1471,7 @@ struct llm_build_context { return lctx.inp_embd_enc; } - struct ggml_tensor * llm_build_inp_KQ_mask_cross() { + struct ggml_tensor * build_inp_KQ_mask_cross() { lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); ggml_set_input(lctx.inp_KQ_mask_cross); cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1); @@ -6775,7 +6775,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); + struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); @@ -6810,7 +6810,7 @@ struct llm_build_context { cb(kq, "kq", il); struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); + struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); cb(kq_b, "kq_b", il); @@ -6909,11 +6909,11 @@ struct llm_build_context { GGML_ASSERT(!lctx.is_encoding); GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); + struct ggml_tensor * embd_enc = build_inp_embd_enc(); + struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); + struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6961,7 +6961,7 @@ struct llm_build_context { cb(kq, "kq", il); struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); + struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); cb(kq_b, "kq_b", il); From f2524c0e4137a4327473c086f97a01aa0632ca3e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Jan 2025 15:04:14 +0200 Subject: [PATCH 11/84] llama : remove references to llama_kv_cache (wip) Intermediate step necessary to abstract the `llama_context` and `llama_kv_cache`. ggml-ci --- src/llama-context.cpp | 1031 ++++++++- src/llama-context.h | 162 +- src/llama.cpp | 4642 +++++++++++++++++------------------------ 3 files changed, 3017 insertions(+), 2818 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9eae6fe57ce1e..910e2243d7e8a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,8 +8,6 @@ #include #include -// llama input - static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -34,56 +32,88 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -void llama_context::set_k_shift(llama_kv_cache & kv) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); +// TODO: improve +void llama_context::reset() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; +} + +void llama_context::prepare_k_shift() { +} - int32_t * data = (int32_t *) inp_K_shift->data; +void llama_context::prepare_defrag() { +} - for (uint32_t i = 0; i < kv.size; ++i) { - data[i] = kv.cells[i].delta; - } +void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { } -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { +// llama input + +void llama_context::set_inputs(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + // // set input data // - const auto & 
hparams = lctx.model.hparams; - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; + if (inp_K_shift) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; + } + + // the K-shift graph requires just this input + return; + } if (ubatch.token) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); } if (ubatch.embd) { const int64_t n_embd = hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); } - if (ubatch.pos && lctx.inp_pos) { + if (ubatch.pos && inp_pos) { const int64_t n_tokens = ubatch.n_tokens; - auto n_pos = lctx.n_pos_per_token; - ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos)); + auto n_pos = n_pos_per_token; + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - if (!lctx.inp_out_ids) { - LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__); + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); } else { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); - int32_t * data = (int32_t *) lctx.inp_out_ids->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; - if (lctx.n_outputs == n_tokens) { + if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { data[i] = i; } @@ -95,26 +125,26 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(lctx.n_outputs == n_outputs); - } else if (lctx.n_outputs == 1) { + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { // only keep last output data[0] = n_tokens - 1; } else { - GGML_ASSERT(lctx.n_outputs == 0); + GGML_ASSERT(n_outputs == 0); } } } GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); - if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) { + if (inp_KQ_mask || inp_KQ_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
- if (cparams.causal_attn && !lctx.is_encoding) { + if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -123,14 +153,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (lctx.inp_KQ_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - data = (float *) lctx.inp_KQ_mask->data; + if (inp_KQ_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); + data = (float *) inp_KQ_mask->data; } - if (lctx.inp_KQ_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer)); - data_swa = (float *) lctx.inp_KQ_mask_swa->data; + if (inp_KQ_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_swa->buffer)); + data_swa = (float *) inp_KQ_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -191,11 +221,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; + float * data = (float *) inp_KQ_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -238,11 +268,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - float * data = (float *) lctx.inp_mean->data; - memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); std::vector sum(n_tokens, 0); @@ -279,11 +309,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; @@ -306,11 +336,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * 
ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); @@ -341,17 +371,18 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; - if (lctx.inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); - float * data = (float *) lctx.inp_s_mask->data; + if (inp_s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); + float * data = (float *) inp_s_mask->data; // clear unused states for (int i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); + // TODO: do not mutate the KV cache // only clear once if (kv_cell.src < 0) { kv_cell.src = cell_id; @@ -359,14 +390,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - int32_t * data = (int32_t *) lctx.inp_s_copy->data; + if (inp_s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); + int32_t * data = (int32_t *) inp_s_copy->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; // prevent out-of-bound sources if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { @@ -375,6 +406,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { data[i] = kv_cell.src; + // TODO: do not mutate the KV cache // ensure copy only happens once if (kv_cell.src != (int32_t) cell_id) { kv_cell.src = cell_id; @@ -383,20 +415,20 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_pos_bucket) { + if (inp_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - int32_t * data = (int32_t *) lctx.inp_pos_bucket->data; + int32_t * data = (int32_t *) inp_pos_bucket->data; - if (!lctx.is_encoding) { + if (!is_encoding) { const int64_t n_kv = kv_self.n; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } @@ -404,28 +436,28 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } } } - if 
(!lctx.is_encoding && lctx.inp_embd_enc) { - assert(lctx.inp_embd_enc->type == GGML_TYPE_F32); - assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size()); + if (!is_encoding && inp_embd_enc) { + assert(inp_embd_enc->type == GGML_TYPE_F32); + assert((size_t) ggml_nelements(inp_embd_enc) == embd_enc.size()); - ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc)); + ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); } - if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) { - const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd; + if (!is_encoding && inp_KQ_mask_cross) { + const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_cross->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - float * data = (float *) lctx.inp_KQ_mask_cross->data; + float * data = (float *) inp_KQ_mask_cross->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -433,7 +465,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float f = -INFINITY; for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch.seq_id[j][s]; - if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) { + if (seq_ids_enc[i].find(seq_id) != seq_ids_enc[i].end()) { f = 0.0f; } } @@ -450,6 +482,851 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } +// do mat_mul, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +// do mat_mul_id, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur, + ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_KQ_mask = causal + ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask, "KQ_mask", -1); + ggml_set_input(inp_KQ_mask); + + inp_KQ_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask, GGML_TYPE_F16) : inp_KQ_mask; + + if (swa) { + GGML_ASSERT(hparams.n_swa > 0); + + inp_KQ_mask_swa = causal + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp_KQ_mask_swa); + + inp_KQ_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask_swa, GGML_TYPE_F16) : inp_KQ_mask_swa; + } +} + +void llama_context::build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + GGML_ASSERT(kv_self.size == n_ctx); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + //cb(k_cache_view, "k_cache_view", il); + + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(graph, ggml_cpy(ctx0, k_cur, k_cache_view)); + + assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + + struct ggml_tensor * v_cache_view = nullptr; + + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self.v_l[il]), + (kv_head)*ggml_element_size(kv_self.v_l[il])); + + v_cur = ggml_transpose(ctx0, v_cur); + } + //cb(v_cache_view, "v_cache_view", il); + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); +} + +ggml_tensor * llama_context::build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; + + // TODO: improve + bool is_sliding = false; + + switch (model.arch) { + case LLM_ARCH_COHERE2: + { + const int32_t sliding_window_pattern = 4; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_GEMMA2: + { + const int32_t sliding_window_pattern = 2; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_PHI3: + { + is_sliding = hparams.n_swa > 0; + } break; + default: + { + is_sliding = false; + } + }; + + const auto & kq_mask = is_sliding ? inp_KQ_mask_swa_cnv : inp_KQ_mask_cnv; + + const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + //cb(k, "k", il); + + struct ggml_tensor * cur; + + if (cparams.flash_attn) { + GGML_UNUSED(model); + GGML_UNUSED(n_ctx); + + // split cached v into n_head heads (not transposed) + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + 0); + //cb(v, "v", il); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); + + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); + } else { + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx0, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx0, kq); + kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); + + GGML_ASSERT(kv_self.size == n_ctx); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + //cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + //cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); + //cb(cur, "kqv_merged_cont", il); + + if (!cparams.offload_kqv) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + ggml_build_forward_expand(graph, cur); + + if (wo) { + cur = build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +ggml_tensor * llama_context::build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) { + const auto & hparams = model.hparams; + + return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); +} + +ggml_tensor * llama_context::get_rope_factors(int il) { + const auto & hparams = model.hparams; + + // choose 
long/short freq factors based on the context size + const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { + return model.layers[il].rope_long; + } + + return model.layers[il].rope_short; +} + +void llama_context::build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & n_ctx = cparams.n_ctx; + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & hparams = model.hparams; + + const auto & n_rot = hparams.n_rot; + const auto & n_layer = hparams.n_layer; + const auto & rope_type = hparams.rope_type; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + GGML_ASSERT(kv_self.size == n_ctx); + + inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + //cb(inp_K_shift, "K_shift", -1); + ggml_set_input(inp_K_shift); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = get_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0); + + struct ggml_tensor * tmp; + if (ggml_is_quantized(k->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); + //cb(tmp, "K_f32", il); + + for (auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + tmp = ggml_rope_ext_inplace(ctx0, tmp, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + //cb(tmp, "K_shifted_f32", il); + + tmp = ggml_cpy(ctx0, tmp, k); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, k, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + //cb(tmp, "K_shifted", il); + + ggml_build_forward_expand(graph, tmp); + } +} + +void llama_context::build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & hparams = model.hparams; + + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = kv_self.cell_max(); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = model.max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (model.max_nodes() - 
2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? + bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, 
nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + } + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes); +#endif +} + +ggml_tensor * llama_context::build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp_s_copy, "inp_s_copy", -1); + ggml_set_input(inp_s_copy); + return inp_s_copy; +} + +ggml_tensor * llama_context::build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp_s_mask, "inp_s_mask", -1); + ggml_set_input(inp_s_mask); + return inp_s_mask; +} + +ggml_tensor * llama_context::build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); + + // copy states + // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv + // this shrinks the tensors's ne[1] to n_kv + states = ggml_get_rows(ctx0, states, state_copy); + + // clear states of sequences which are starting at the beginning of this batch + // FIXME: zero-out NANs? + states = ggml_mul(ctx0, states, state_mask); + + // copy states which won't be changed further (between n_seqs and n_kv) + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)), + ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); + + // the part of the states that will be used and modified + return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0); +} + +// TODO: split +ggml_tensor * llama_context::build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_tokens = ubatch.n_tokens; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + const int64_t n_seqs = ubatch.n_seqs; + // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) + const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; + // Use the same RMS norm as the final layer norm + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + struct ggml_tensor * conv_states_all = kv_self.k_l[il]; + struct ggml_tensor * ssm_states_all = kv_self.v_l[il]; + + // (ab)using the KV cache to store the states + struct ggml_tensor * conv = build_copy_mask_state( + ctx0, graph, conv_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); + struct ggml_tensor * ssm = build_copy_mask_state( + ctx0, graph, ssm_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); + struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); + + // copy last (d_conv - 1) columns back into the state cache + struct ggml_tensor * last_conv = 
ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. + x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); + + x = ggml_silu(ctx0, x); + } + + // ssm + { + // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); + struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); + + // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms) { + dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); + B = ggml_rms_norm(ctx0, B, norm_rms_eps); + C = ggml_rms_norm(ctx0, C, norm_rms_eps); + } + + // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. 
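+        // A conceptual sketch of what the scan computes per token t (assuming the
+        // standard Mamba discretization; the exact update is implemented inside ggml_ssm_scan):
+        //   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
+        //   y_t = C_t * h_t
+        // The D skip connection and the SiLU(z) gating are applied to y below.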
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); + + // store last states + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); + + struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); + + // TODO: skip computing output earlier for unused tokens + + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + //cb(cur, "mamba_out", il); + + return cur; +} + + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 73baa711f394a..a2f41b5c8fc7d 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -14,6 +14,8 @@ #include #include +using llama_loras = std::unordered_map; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -22,12 +24,10 @@ struct llama_context { const struct llama_model & model; - struct llama_cparams cparams; - struct llama_sbatch sbatch; // TODO: revisit if needed - struct llama_kv_cache kv_self; - struct llama_adapter_cvec cvec; - - std::unordered_map lora; + llama_cparams cparams; + llama_sbatch sbatch; // TODO: revisit if needed + llama_adapter_cvec cvec; + llama_loras loras; std::vector backends; std::vector> set_n_threads_fns; @@ -72,18 +72,6 @@ struct llama_context { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // TODO: find a better way to accommodate mutli-dimension position encoding methods - // number of position id each token get, 1 for each token in most cases. - // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. 
- int n_pos_per_token = 1; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - // memory buffers used to evaluate the model std::vector buf_compute_meta; ggml_backend_sched_ptr sched; @@ -91,28 +79,144 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + void reset(); + + void prepare_k_shift(); + void prepare_defrag(); + void prepare_decode(const llama_ubatch & ubatch); + + void set_inputs(const llama_ubatch & ubatch); + + ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] + + // === encoder-decoder === + + // whether we are computing encoder output or decoder output + bool is_encoding = false; + + // output of the encoder part of the encoder-decoder models + std::vector embd_enc; + std::vector> seq_ids_enc; + + struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] + struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] + + // === unified KV cache === + + llama_kv_cache kv_self; + + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] + + void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case); + + void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case); + + ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case); + + ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale); + + ggml_tensor * get_rope_factors(int il); + + void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph); + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph); + + // === recurrent === + + // TODO: add recurrent cache + // TODO: add mamba-specific llama_context + + // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl + ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * 
state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case); + + ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] - void set_k_shift(llama_kv_cache & kv); -}; + // === vision === -// TODO: make these methods of llama_context -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); + // TODO: find a better way to accommodate mutli-dimension position encoding methods + // number of position id each token get, 1 for each token in most cases. + // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. + int n_pos_per_token = 1; +}; // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. diff --git a/src/llama.cpp b/src/llama.cpp index 37816ddc28a38..a2e5e0bea0fb5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4,8 +4,6 @@ #include "llama-mmap.h" #include "llama-context.h" #include "llama-vocab.h" -#include "llama-sampling.h" -#include "llama-kv-cache.h" #include "llama-model-loader.h" #include "llama-model.h" @@ -106,946 +104,15 @@ enum llm_norm_type { LLM_NORM_GROUP, }; -static struct ggml_tensor * llm_build_inp_embd( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_hparams & hparams, - const llama_ubatch & ubatch, - struct ggml_tensor * tok_embd, - const llm_build_cb & cb) { - const int64_t n_embd = hparams.n_embd; - - struct ggml_tensor * inpL; - - if (ubatch.token) { - lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens); - cb(lctx.inp_tokens, "inp_tokens", -1); - ggml_set_input(lctx.inp_tokens); - - inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); - - // apply lora for embedding tokens if needed - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat( - ctx, lw->b, // non-transposed lora_b - ggml_get_rows(ctx, lw->a, lctx.inp_tokens) - ), scale); - inpL = ggml_add(ctx, inpL, inpL_delta); - } - } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = lctx.inp_embd; - ggml_set_input(lctx.inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); - } - - cb(inpL, "inp_embd", -1); - - return inpL; -} - -static void llm_build_kv_store( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, - int32_t kv_head, - const llm_build_cb & cb, - int64_t il) { - const int64_t n_ctx = cparams.n_ctx; - - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = 
hparams.n_embd_v_gqa(il); - - GGML_ASSERT(kv.size == n_ctx); - - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head); - cb(k_cache_view, "k_cache_view", il); - - // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); - - struct ggml_tensor * v_cache_view = nullptr; - - if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head); - } else { - // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv.v_l[il]), - (kv_head)*ggml_element_size(kv.v_l[il])); - - v_cur = ggml_transpose(ctx, v_cur); - } - cb(v_cache_view, "v_cache_view", il); - - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); -} - -// do mat_mul, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, - struct ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -// do mat_mul_id, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm_id( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, // struct ggml_tensor * as - struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { - struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float alpha = it.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? 
it.second * alpha / rank : it.second; - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -static struct ggml_tensor * llm_build_norm( - struct ggml_context * ctx, - struct ggml_tensor * cur, - const llama_hparams & hparams, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type, - const llm_build_cb & cb, - int il) { - switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; - case LLM_NORM_GROUP: - { - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); - } break; - } - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - - if (mb) { - cur = ggml_add(ctx, cur, mb); - } - - return cur; -} - -static struct ggml_tensor * llm_build_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * up, - struct ggml_tensor * up_b, - struct ggml_tensor * up_s, - struct ggml_tensor * gate, - struct ggml_tensor * gate_b, - struct ggml_tensor * gate_s, - struct ggml_tensor * down, - struct ggml_tensor * down_b, - struct ggml_tensor * down_s, - struct ggml_tensor * act_scales, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, - const llm_build_cb & cb, - int il) { - struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (up_s) { - tmp = ggml_mul(ctx, tmp, up_s); - cb(tmp, "ffn_up_s", il); - } - - if (gate) { - switch (type_gate) { - case LLM_FFN_SEQ: - { - cur = llm_build_lora_mm(lctx, ctx, gate, tmp); - cb(cur, "ffn_gate", il); - } break; - case LLM_FFN_PAR: - { - cur = llm_build_lora_mm(lctx, ctx, gate, cur); - cb(cur, "ffn_gate", il); - } break; - } - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - - if (gate_s) { - cur = ggml_mul(ctx, cur, gate_s); - cb(cur, "ffn_gate_s", il); - } - - } else { - cur = tmp; - } - - switch (type_op) { - case LLM_FFN_SILU: - { - cur = ggml_silu(ctx, cur); - cb(cur, "ffn_silu", il); - } break; - case LLM_FFN_GELU: - { - cur = ggml_gelu(ctx, cur); - cb(cur, "ffn_gelu", il); - if (act_scales != NULL) { - cur = ggml_div(ctx, cur, act_scales); - cb(cur, "ffn_act", il); - } - } break; - case LLM_FFN_RELU: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - } break; - case LLM_FFN_RELU_SQR: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - - cur = ggml_sqr(ctx, cur); - cb(cur, "ffn_sqr(relu)", il); - } break; - case LLM_FFN_SWIGLU: - { - // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - int64_t split_point = cur->ne[0] / 2; - struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0)); - struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_silu(ctx, x0); - cb(cur, "ffn_silu", il); - - cur = ggml_mul(ctx, x0, x1); - cb(cur, "ffn_mul", il); - } break; - } - - if (type_gate == LLM_FFN_PAR) { - cur = ggml_mul(ctx, cur, tmp); - cb(cur, "ffn_gate_par", il); - } - - if (down) { - cur = llm_build_lora_mm(lctx, ctx, down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx, cur, down_b); - } - - if (down_s) { - cur = ggml_mul(ctx, cur, down_s); - cb(cur, "ffn_down_s", il); - } - - return cur; -} - -static struct ggml_tensor * llm_build_moe_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * gate_inp, - struct ggml_tensor * up_exps, - struct ggml_tensor * gate_exps, - struct ggml_tensor * down_exps, - struct ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llm_ffn_op_type type_op, - bool norm_w, - bool scale_w, - float w_scale, -llama_expert_gating_func_type gating_op, - const llm_build_cb & cb, - int il) { - int64_t n_embd = cur->ne[0]; - int64_t n_tokens = cur->ne[1]; - - ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = nullptr; - switch (gating_op) { - case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: - { - probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] - } break; - case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: - { - probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens] - } break; - default: - GGML_ABORT("fatal error"); - } - cb(probs, "ffn_moe_probs", il); - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx, probs, exp_probs_b); - cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx, - ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - if (norm_w) { - weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); - - ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - - weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); - } - if (scale_w) { - weights = ggml_scale(ctx, weights, w_scale); - cb(weights, "ffn_moe_weights_scaled", il); - } - - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, 
n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); - - switch (type_op) { - case LLM_FFN_SILU: - { - gate = ggml_silu(ctx, gate); - cb(gate, "ffn_moe_silu", il); - } break; - case LLM_FFN_GELU: - { - gate = ggml_gelu(ctx, gate); - cb(gate, "ffn_moe_gelu", il); - } break; - default: - GGML_ABORT("fatal error"); - } - - ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); - - ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx, experts, weights); - - // aggregate experts - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, - experts->nb[2], i*experts->nb[1]); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx, moe_out, cur_expert); - } - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx, moe_out); - } - - return moe_out; -} - -static struct ggml_tensor * llm_build_kqv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); - cb(q, "q", il); - - struct ggml_tensor * k = - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); - - struct ggml_tensor * cur; - - if (cparams.flash_attn) { - GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - - // split cached v into n_head heads (not transposed) - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv.v_l[il]->type, n_embd_head_v), - 0); - cb(v, "v", il); - - cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, - hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); - - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); - } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below - - kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx, kq, 30); - } - - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx, kq); - kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); - } - - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - - GGML_ASSERT(kv.size == n_ctx); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - } - - ggml_build_forward_expand(graph, cur); - - if (wo) { - cur = llm_build_lora_mm(lctx, ctx, wo, cur); - } - - if (wo_b) { - cb(cur, "kqv_wo", il); - } - - if (wo_b) { - cur = ggml_add(ctx, cur, wo_b); - } - - return cur; -} - -static struct ggml_tensor * llm_build_kv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t kv_head, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); - - llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); - - struct ggml_tensor * cur; - - cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); - cb(cur, "kqv_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_copy_mask_state( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct ggml_tensor * s, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t n_state, - int32_t kv_size, - int32_t kv_head, - int32_t n_kv, - int32_t n_seqs) { - struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size); - - // copy states - // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv - // this shrinks the tensors's ne[1] to n_kv - states = 
ggml_get_rows(ctx, states, state_copy); - - // clear states of sequences which are starting at the beginning of this batch - // FIXME: zero-out NANs? - states = ggml_mul(ctx, states, state_mask); - - // copy states which won't be changed further (between n_seqs and n_kv) - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), - ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); - - // the part of the states that will be used and modified - return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0); -} - -// TODO: split -static struct ggml_tensor * llm_build_mamba( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_ubatch & ubatch, - struct ggml_cgraph * graph, - struct ggml_tensor * cur, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t kv_head, - int32_t n_kv, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = model.hparams; - const llama_kv_cache & kv = lctx.kv_self; - const int64_t d_conv = hparams.ssm_d_conv; - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t d_state = hparams.ssm_d_state; - const int64_t dt_rank = hparams.ssm_dt_rank; - const int64_t n_seqs = ubatch.n_seqs; - // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) - const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; - // Use the same RMS norm as the final layer norm - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * conv_states_all = kv.k_l[il]; - struct ggml_tensor * ssm_states_all = kv.v_l[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * conv = llm_build_copy_mask_state(ctx, - graph, conv_states_all, state_copy, state_mask, - hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs); - conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs); - struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx, - graph, ssm_states_all, state_copy, state_mask, - hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs); - ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs); - - // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs); - - // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur); - // split the above in two - // => {d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); - struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - - // conv - { - // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} - struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0); - - // copy last (d_conv - 1) columns back into the state cache - struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - - ggml_build_forward_expand(graph, - ggml_cpy(ctx, last_conv, - ggml_view_1d(ctx, conv_states_all, - (d_conv - 1)*(d_inner)*(n_seqs), - 
kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - - // 1D convolution - // The equivalent is to make a self-overlapping view of conv_x - // over d_conv columns at each stride in the 3rd dimension, - // then element-wise multiply that with the conv1d weight, - // then sum the elements of each row, - // (the last two steps are a dot product over rows (also doable with mul_mat)) - // then permute away the ne[0] dimension, - // and then you're left with the resulting x tensor. - // For simultaneous sequences, all sequences need to have the same length. - x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); - - // bias - x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b); - - x = ggml_silu(ctx, x); - } - - // ssm - { - // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} - struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x); - // split - struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); - struct ggml_tensor * B = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - - // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_dt_b_c_rms) { - dt = ggml_rms_norm(ctx, dt, norm_rms_eps); - B = ggml_rms_norm(ctx, B, norm_rms_eps); - C = ggml_rms_norm(ctx, C, norm_rms_eps); - } - - // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} - dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt); - dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b); - - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. 
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} - struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); - - // store last states - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), - ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); - - struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - - // TODO: skip computing output earlier for unused tokens - - // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} - y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z))); - - // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y); - } - - // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs); - cb(cur, "mamba_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_rwkv6_time_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev, - struct ggml_tensor ** wkv_state, - size_t wkv_head_size, - size_t head_count_kv) { - size_t n_embd = cur->ne[0]; - size_t n_seq_tokens = cur->ne[1]; - size_t n_seqs = cur->ne[2]; - - size_t head_size = wkv_head_size; - size_t head_count = n_embd / head_size; - - size_t n_tokens = n_seqs * n_seq_tokens; - - bool is_qrwkv = layer->time_mix_first == nullptr; - - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - - sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - - struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); - - xxx = ggml_reshape_4d( - ctx, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_w1, xxx) - ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - ); - - xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2)); - - xxx = ggml_mul_mat( - ctx, - ggml_reshape_4d( - ctx, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - ), - xxx - ); - - struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { - // fusing these weights makes some performance improvement - sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens); - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur); - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - } else { - // for backward compatibility - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * 
sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur); - } - - struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr); - struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk); - struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx, r, layer->time_mix_receptance_b); - } - if (layer->time_mix_key_b) { - k = ggml_add(ctx, k, layer->time_mix_key_b); - } - if (layer->time_mix_value_b) { - v = ggml_add(ctx, v, layer->time_mix_value_b); - } - - struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg); - if (is_qrwkv) { - g = ggml_sigmoid(ctx, g); - } else { - g = ggml_silu(ctx, g); - } - - if (head_count_kv != head_count) { - GGML_ASSERT(head_count % head_count_kv == 0); - k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens); - v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens); - struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - k = ggml_repeat(ctx, k, tmp); - v = ggml_repeat(ctx, v, tmp); - } - - k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens); - v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens); - r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens); - - struct ggml_tensor * w = ggml_mul_mat( - ctx, - layer->time_mix_decay_w2, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw) - ) - ); - - w = ggml_add(ctx, w, layer->time_mix_decay); - w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); - w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens); - - if (is_qrwkv) { - // k = k * (1 - w) - k = ggml_sub(ctx, k, ggml_mul(ctx, k, w)); - } - - struct ggml_tensor * wkv_output; - if (!layer->time_mix_first) { - wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - } else { - wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - } - cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - if (!is_qrwkv) { - // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); - cur = ggml_norm(ctx, cur, 64e-5f); - - // Convert back to regular vectors. 
- cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); - } else { - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - } - - cur = ggml_mul(ctx, cur, g); - cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - - return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); -} - -static struct ggml_tensor * llm_build_rwkv6_channel_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev) { - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur); - struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur); - - struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr)); - struct ggml_tensor * k = ggml_sqr( - ctx, - ggml_relu( - ctx, - llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk) - ) - ); - - return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k)); -} - struct llm_build_context { - const llama_model & model; - llama_context & lctx; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - const llama_kv_cache & kv_self; + llama_context & lctx; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; + //const llama_kv_cache & kv_self; + const llama_adapter_cvec & cvec; + const llama_loras & loras; const int64_t n_embd; const int64_t n_layer; @@ -1070,12 +137,13 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; - const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_ctx_orig; + const bool worst_case; const bool flash_attn; const enum llama_pooling_type pooling_type; @@ -1089,16 +157,18 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, - bool worst_case) : - model (lctx.model), + llama_context & lctx, + const llama_ubatch & ubatch, + const llm_build_cb & cb, + bool worst_case) : lctx (lctx), + model (lctx.model), hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - kv_self (lctx.kv_self), + //kv_self (lctx.kv_self), + cvec (lctx.cvec), + loras (lctx.loras), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -1120,11 +190,12 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - n_kv (worst_case ? kv_self.size : kv_self.n), + //n_kv (worst_case ? kv_self.size : kv_self.n), + //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), - kv_head (worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_ctx_orig (cparams.n_ctx_orig_yarn), + worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -1133,156 +204,614 @@ struct llm_build_context { // all initializations should be done in init() } - void init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; + void init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ctx0 = ggml_init(params); + + lctx.reset(); + } + + void free() { + ggml_free(ctx0); + ctx0 = nullptr; + } + + struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { + struct ggml_tensor * inpL; + + if (ubatch.token) { + lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(lctx.inp_tokens, "inp_tokens", -1); + ggml_set_input(lctx.inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, lctx.inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, lctx.inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + lctx.inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = lctx.inp_embd; + ggml_set_input(lctx.inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + cb(inpL, "inp_embd", -1); + + return inpL; + } + + // do mat_mul, while optionally apply lora + struct ggml_tensor * build_lora_mm( + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + // do mat_mul_id, while optionally apply lora + struct ggml_tensor * build_lora_mm_id( + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? 
lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + struct ggml_tensor * build_norm( + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM_GROUP: + { + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); + cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); + } break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; + } + + struct ggml_tensor * build_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * up_s, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * gate_s, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * down_s, + struct ggml_tensor * act_scales, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (up_s) { + tmp = ggml_mul(ctx0, tmp, up_s); + cb(tmp, "ffn_up_s", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = build_lora_mm(gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = build_lora_mm(gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + + if (gate_s) { + cur = ggml_mul(ctx0, cur, gate_s); + cb(cur, "ffn_gate_s", il); + } + + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx0, cur, act_scales); + cb(cur, "ffn_act", il); + } + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx0, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + case LLM_FFN_SWIGLU: + { + // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + int64_t split_point = cur->ne[0] / 2; + struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + x0 = ggml_silu(ctx0, x0); + cb(cur, "ffn_silu", il); + + cur = ggml_mul(ctx0, x0, x1); + cb(cur, "ffn_mul", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + if (down) { + cur = build_lora_mm(down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } - ctx0 = ggml_init(params); + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; - lctx.inp_pos_bucket = nullptr; - lctx.inp_embd_enc = nullptr; - lctx.inp_KQ_mask_cross = nullptr; - } + if (down_s) { + cur = ggml_mul(ctx0, cur, down_s); + cb(cur, "ffn_down_s", il); + } - void free() { - ggml_free(ctx0); - ctx0 = nullptr; + return cur; } - struct ggml_cgraph * build_k_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + struct ggml_tensor * build_moe_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * gate_inp, + struct ggml_tensor * up_exps, + struct ggml_tensor * gate_exps, + struct ggml_tensor * down_exps, + struct ggml_tensor * exp_probs_b, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, + llama_expert_gating_func_type gating_op, + const llm_build_cb & cb, + int il) { + int64_t n_embd = cur->ne[0]; + int64_t n_tokens = cur->ne[1]; - GGML_ASSERT(kv_self.size == n_ctx); + ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); - lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(lctx.inp_K_shift, "K_shift", -1); - ggml_set_input(lctx.inp_K_shift); + ggml_tensor * probs = nullptr; + switch (gating_op) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: + { + probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] + } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: + { + probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] + } break; + default: + GGML_ABORT("fatal error"); + } + cb(probs, "ffn_moe_probs", il); - for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + // add experts selection bias - introduced in DeepSeek V3 + // leave probs unbiased as it's later used to get expert weights + ggml_tensor * selection_probs = probs; + if (exp_probs_b != nullptr) { + selection_probs = ggml_add(ctx0, probs, exp_probs_b); + cb(selection_probs, "ffn_moe_probs_biased", il); + } - struct ggml_tensor * rope_factors = build_rope_factors(il); + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + cb(selected_experts, "ffn_moe_topk", il); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, 
n_head_kv, n_ctx, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - 0); + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights", il); - struct ggml_tensor * tmp; - if (ggml_is_quantized(k->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); - cb(tmp, "K_f32", il); + if (norm_w) { + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); - for (auto & backend : lctx.backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get()); - break; - } - } - tmp = ggml_rope_ext_inplace(ctx0, tmp, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted_f32", il); + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] + cb(weights_sum, "ffn_moe_weights_sum", il); - tmp = ggml_cpy(ctx0, tmp, k); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, k, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - } - cb(tmp, "K_shifted", il); + weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_norm", il); - ggml_build_forward_expand(gf, tmp); + weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); + } + if (scale_w) { + weights = ggml_scale(ctx0, weights, w_scale); + cb(weights, "ffn_moe_weights_scaled", il); } - return gf; - } + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); - struct ggml_cgraph * build_defrag(const std::vector & ids) { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(gate, "ffn_moe_gate", il); + + switch (type_op) { + case LLM_FFN_SILU: + { + gate = ggml_silu(ctx0, gate); + cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx0, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ABORT("fatal error"); + } - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; + ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] + cb(par, "ffn_moe_gate_par", il); - if (i == id || id == ids.size()) { - continue; - } + ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + cb(experts, "ffn_moe_down", il); - uint32_t nm = 1; + experts = ggml_mul(ctx0, experts, weights); - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; + // aggregate experts + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); } + } - for (int il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = 
hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx0, moe_out); + } - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + return moe_out; + } - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + struct ggml_tensor * build_attn( + struct ggml_cgraph * graph, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + const llm_build_cb & cb, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(graph, q_cur); + ggml_build_forward_expand(graph, k_cur); + ggml_build_forward_expand(graph, v_cur); - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; + //build_kv_store(graph, k_cur, v_cur, il); + lctx.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); - if (flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + struct ggml_tensor * cur; - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); - } + //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lctx.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cb(cur, "kqv_out", il); + + return cur; + } + + //struct ggml_tensor * build_rwkv6_time_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev, + // struct ggml_tensor ** wkv_state, + // size_t wkv_head_size, + // size_t head_count_kv) { + // size_t n_embd = cur->ne[0]; + // size_t n_seq_tokens = cur->ne[1]; + // size_t n_seqs = cur->ne[2]; + + // size_t head_size = wkv_head_size; + // size_t head_count = n_embd / head_size; + + // size_t n_tokens = n_seqs * n_seq_tokens; + + // bool is_qrwkv = layer->time_mix_first == nullptr; + + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + + // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + + // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + // xxx = ggml_reshape_4d( + // ctx0, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + // ), + // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + // ); + + // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + // xxx = ggml_mul_mat( + // ctx0, + // ggml_reshape_4d( + // ctx0, + // 
layer->time_mix_w2, + // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + // ), + // xxx + // ); + + // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + // if (layer->time_mix_lerp_fused) { + // // fusing these weights makes some performance improvement + // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + // } else { + // // for backward compatibility + // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + // } + + // struct ggml_tensor * r = build_lora_mm(layer->time_mix_receptance, xr); + // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); + // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); + // if (layer->time_mix_receptance_b) { + // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + // } + // if (layer->time_mix_key_b) { + // k = ggml_add(ctx0, k, layer->time_mix_key_b); + // } + // if (layer->time_mix_value_b) { + // v = ggml_add(ctx0, v, layer->time_mix_value_b); + // } + + // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); + // if (is_qrwkv) { + // g = ggml_sigmoid(ctx0, g); + // } else { + // g = ggml_silu(ctx0, g); + // } + + // if (head_count_kv != head_count) { + // GGML_ASSERT(head_count % head_count_kv == 0); + // k = ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); + // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); + // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); + // k = ggml_repeat(ctx0, k, tmp); + // v = ggml_repeat(ctx0, v, tmp); + // } + + // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); + // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); + // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); + + // struct ggml_tensor * w = ggml_mul_mat( + // ctx0, + // layer->time_mix_decay_w2, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + // ) + // ); + + // w = ggml_add(ctx0, w, 
layer->time_mix_decay); + // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); + + // if (is_qrwkv) { + // // k = k * (1 - w) + // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + // } + + // struct ggml_tensor * wkv_output; + // if (!layer->time_mix_first) { + // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); + // } else { + // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, *wkv_state); + // } + // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + // if (!is_qrwkv) { + // // group norm with head_count groups + // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); + // cur = ggml_norm(ctx0, cur, 64e-5f); + + // // Convert back to regular vectors. + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + // } else { + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // } + + // cur = ggml_mul(ctx0, cur, g); + // cur = build_lora_mm(layer->time_mix_output, cur); + + // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); + //} + + //struct ggml_tensor * build_rwkv6_channel_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev) { + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + // struct ggml_tensor * k = ggml_sqr( + // ctx0, + // ggml_relu( + // ctx0, + // build_lora_mm(layer->channel_mix_key, xk) + // ) + // ); + + // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + //} - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } + struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - i += nm - 1; - } + lctx.build_k_shift(ctx0, gf); + + return gf; + } + + struct ggml_cgraph * build_defrag() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); + lctx.build_defrag(ctx0, gf); return gf; } @@ -1294,21 +823,6 @@ struct llm_build_context { return lctx.inp_pos; } - struct ggml_tensor * build_rope_factors(int il) { - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - - if (model.layers[il].rope_freqs != nullptr) { - return model.layers[il].rope_freqs; - } - - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } - - return model.layers[il].rope_short; - } - struct ggml_tensor * build_inp_out_ids() { lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); cb(lctx.inp_out_ids, "inp_out_ids", -1); @@ -1316,28 +830,6 @@ struct llm_build_context { return lctx.inp_out_ids; } - struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { - lctx.inp_KQ_mask = causal - ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask, "KQ_mask", -1); - ggml_set_input(lctx.inp_KQ_mask); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; - } - - struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) { - GGML_ASSERT(hparams.n_swa > 0); - - lctx.inp_KQ_mask_swa = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(lctx.inp_KQ_mask_swa); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa; - } - struct ggml_tensor * build_inp_mean() { lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); cb(lctx.inp_mean, "inp_mean", -1); @@ -1352,20 +844,6 @@ struct llm_build_context { return lctx.inp_cls; } - struct ggml_tensor * build_inp_s_copy() { - lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - cb(lctx.inp_s_copy, "inp_s_copy", -1); - ggml_set_input(lctx.inp_s_copy); - return lctx.inp_s_copy; - } - - struct ggml_tensor * build_inp_s_mask() { - lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - cb(lctx.inp_s_mask, "inp_s_mask", -1); - ggml_set_input(lctx.inp_s_mask); - return lctx.inp_s_mask; - } - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; @@ -1431,37 +909,37 @@ struct llm_build_context { return gf; } - struct ggml_tensor * build_pos_bucket(bool causal) { - if (causal) { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - } else { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - } + //struct ggml_tensor * build_pos_bucket(bool causal) { + // if (causal) { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + // } else { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + // } - ggml_set_input(lctx.inp_pos_bucket); - cb(lctx.inp_pos_bucket, "pos_bucket", -1); + // ggml_set_input(lctx.inp_pos_bucket); + // cb(lctx.inp_pos_bucket, "pos_bucket", -1); - return lctx.inp_pos_bucket; - } + // return lctx.inp_pos_bucket; + //} - struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - cb(pos_bucket_1d, "pos_bucket_1d", -1); + //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); + // cb(pos_bucket_1d, "pos_bucket_1d", -1); - struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - cb(pos_bias, "pos_bias", -1); + // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - cb(pos_bias, "pos_bias", -1); + // pos_bias = 
ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_cont(ctx0, pos_bias); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_cont(ctx0, pos_bias); + // cb(pos_bias, "pos_bias", -1); - return pos_bias; - } + // return pos_bias; + //} struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; @@ -1491,45 +969,44 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1550,9 +1027,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1574,12 +1051,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, 
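// For reference: a minimal standalone sketch (plain C++, not part of this patch) of the
// low-rank update that the build_lora_mm helper used for the Q/K/V projections above
// composes around the base matmul, i.e. y = W*x + scale * B*(A*x), with
// scale = lora_scale * alpha / rank (falling back to lora_scale alone when alpha == 0).
// The matvec/lora_matvec names and the flat row-major layout are illustrative only.
#include <cstddef>
#include <vector>

static std::vector<float> matvec(const std::vector<float> & M, size_t rows, size_t cols,
                                 const std::vector<float> & x) {
    std::vector<float> y(rows, 0.0f);
    for (size_t r = 0; r < rows; ++r) {
        for (size_t c = 0; c < cols; ++c) {
            y[r] += M[r*cols + c] * x[c];
        }
    }
    return y;
}

static std::vector<float> lora_matvec(
        const std::vector<float> & W, size_t n_out, size_t n_in,   // base weight, n_out x n_in
        const std::vector<float> & A,                              // lora_a, rank x n_in
        const std::vector<float> & B, size_t rank,                 // lora_b, n_out x rank
        float alpha, float lora_scale,
        const std::vector<float> & x) {
    const float scale = alpha != 0.0f ? lora_scale * alpha / (float) rank : lora_scale;

    std::vector<float> y   = matvec(W, n_out, n_in, x);   // base projection
    std::vector<float> ax  = matvec(A, rank,  n_in, x);   // project down to the LoRA rank
    std::vector<float> bax = matvec(B, n_out, rank, ax);  // project back up to n_out
    for (size_t i = 0; i < n_out; ++i) {
        y[i] += scale * bax[i];                           // add the scaled low-rank delta
    }
    return y;
}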
@@ -1588,12 +1065,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -1615,7 +1092,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1624,13 +1101,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1657,13 +1134,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -1676,37 +1152,37 @@ struct llm_build_context { cur = inpL; } else { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } if (n_head > 0 && n_head_kv == 0) { // "linear attention" of Llama-3_1-Nemotron-51B - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); cb(cur, "wo", il); } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1727,9 +1203,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, 
il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1754,12 +1230,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1776,7 +1252,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1785,13 +1261,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1815,31 +1291,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -1865,9 +1340,9 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1882,12 +1357,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1897,7 +1372,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = 
lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1906,13 +1381,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -1930,31 +1405,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -1970,9 +1444,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1987,12 +1461,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2002,7 +1476,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2011,11 +1485,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2034,37 +1508,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); 
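// For reference: a minimal standalone sketch (plain C++, not part of this patch) of what the
// LLM_NORM_RMS branch of build_norm computes for a single row: divide by the root mean
// square (plus eps), then multiply by the norm weight and optionally add a bias. The
// rms_norm name and std::vector layout are illustrative only.
#include <cmath>
#include <cstddef>
#include <vector>

static std::vector<float> rms_norm(const std::vector<float> & x,
                                   const std::vector<float> & w,   // mw
                                   const std::vector<float> * b,   // mb (may be null)
                                   float eps) {
    float ss = 0.0f;
    for (float v : x) {
        ss += v*v;
    }
    const float inv_rms = 1.0f / std::sqrt(ss / (float) x.size() + eps);

    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * inv_rms * w[i];
        if (b) {
            y[i] += (*b)[i];
        }
    }
    return y;
}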
// inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm_2", il); } else { cur = attn_norm; } - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2091,9 +1564,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2108,7 +1581,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result + cur = build_ffn(attn_norm, // !! use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2119,7 +1592,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2129,13 +1602,13 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2156,7 +1629,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // multiply by embedding_multiplier_scale of 78.38367176906169 inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); @@ -2164,37 +1637,36 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = 
build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -2215,9 +1687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -2231,9 +1703,9 @@ struct llm_build_context { // Grok // if attn_out_norm is present then apply it before adding the input if (model.layers[il].attn_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_out_norm", il); } @@ -2242,12 +1714,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2264,16 +1736,16 @@ struct llm_build_context { // if layer_out_norm is present then apply it before adding the input // Idea: maybe ffn_out_norm is a better name if (model.layers[il].layer_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].layer_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "layer_out_norm", il); } cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2282,13 +1754,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // Grok // multiply logits by output_multiplier_scale of 0.5773502691896257 @@ -2316,21 +1788,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); 
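// For reference: a minimal standalone sketch (plain C++, not part of this patch) of the routing
// step performed by build_moe_ffn in the unbiased softmax case: softmax over the expert logits,
// top-k selection, and optional renormalization of the selected weights so they sum to 1.
// The moe_route/route_top_k names are illustrative only.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

struct moe_route {
    std::vector<int>   experts; // selected expert indices
    std::vector<float> weights; // matching expert weights
};

static moe_route route_top_k(const std::vector<float> & logits, int n_expert_used, bool norm_w) {
    // softmax over the expert logits (LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX)
    std::vector<float> probs(logits.size());
    const float mx = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(logits[i] - mx);
        sum += probs[i];
    }
    for (float & p : probs) {
        p /= sum;
    }

    // select the n_expert_used most probable experts
    std::vector<int> idx(logits.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&probs](int a, int b) { return probs[a] > probs[b]; });

    moe_route r;
    float wsum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) {
        r.experts.push_back(idx[i]);
        r.weights.push_back(probs[idx[i]]);
        wsum += probs[idx[i]];
    }
    if (norm_w) {
        for (float & w : r.weights) {
            w /= wsum; // renormalize the selected weights
        }
    }
    return r;
}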
cb(cur, "attn_norm", il); // self-attention @@ -2339,7 +1810,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -2367,9 +1838,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2385,12 +1856,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].attn_out_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(ffn_inp, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2406,7 +1877,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2415,13 +1886,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM, cb, -1); + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -2440,13 +1911,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -2455,15 +1925,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2479,9 +1949,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2497,13 +1967,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, 
model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2513,20 +1983,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2543,28 +2013,27 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -2573,9 +2042,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2590,12 +2059,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2605,7 +2074,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2614,13 +2083,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = 
build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2645,7 +2114,7 @@ struct llm_build_context { } // construct input embeddings (token, type, position) - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); @@ -2656,11 +2125,10 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // embed layer norm - inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false); + lctx.build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -2672,33 +2140,33 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2730,7 +2198,8 @@ struct llm_build_context { struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + kq = lctx.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -2747,7 +2216,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].bo) { cb(cur, "kqv_wo", il); } @@ -2768,11 +2237,11 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); // attention layer norm - cur = 
llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); if (model.layers[il].attn_norm_2 != nullptr) { cur = ggml_add(ctx0, cur, inpL); // re-add the layer input - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); } struct ggml_tensor * ffn_inp = cur; @@ -2780,21 +2249,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2807,7 +2276,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); // output layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); // input for next layer inpL = cur; @@ -2832,27 +2301,26 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - inpL = llm_build_norm(ctx0, inpL, hparams, + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2868,9 +2336,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2886,13 +2354,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, 
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2902,20 +2370,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2934,10 +2402,9 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -2952,17 +2419,17 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { cur = attn_norm; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); if (model.layers[il].bqkv){ @@ -2985,30 +2452,30 @@ struct llm_build_context { // Q/K Layernorm if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } } @@ -3025,12 +2492,12 @@ struct llm_build_context { // feed forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3040,7 +2507,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, 
cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3049,13 +2516,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3072,22 +2539,21 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * inpSA = cur; @@ -3095,21 +2561,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -3122,17 +2588,17 @@ struct llm_build_context { cb(Kcur, "Kcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -3151,9 +2617,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3170,16 +2636,16 @@ struct llm_build_context { // feed-forward network { if (model.layers[il].ffn_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); } else { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, 
NULL, @@ -3189,7 +2655,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3198,14 +2664,14 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3222,25 +2688,24 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3270,9 +2735,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3287,12 +2752,12 @@ struct llm_build_context { // feed-forward forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3302,7 +2767,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3311,13 +2776,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3335,37 +2800,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * 
inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3384,9 +2848,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3400,12 +2864,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3414,7 +2878,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3423,13 +2887,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3446,7 +2910,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); @@ -3454,8 +2918,8 @@ struct llm_build_context { ggml_set_input(lctx.inp_pos); struct ggml_tensor * inp_pos = lctx.inp_pos; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -3463,25 +2927,25 @@ struct llm_build_context { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE 
them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3502,9 +2966,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3518,12 +2982,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3532,7 +2996,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3541,13 +3005,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3568,37 +3032,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - 
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3617,9 +3080,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3634,13 +3097,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3655,14 +3118,14 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); cb(cur_gate_inp, "ffn_shexp_gate_inp", il); // sigmoid ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_ffn = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -3680,7 +3143,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3689,13 +3152,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3715,19 +3178,18 @@ struct llm_build_context { struct ggml_tensor * ffn_output; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm_output, "attn_norm", il); // self-attention @@ -3737,7 +3199,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3747,9 +3209,9 @@ struct llm_build_context { Kcur = 
ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3776,9 +3238,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3791,7 +3253,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, + ffn_output = build_ffn(attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3802,20 +3264,20 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); @@ -3834,19 +3296,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = nullptr; - if (hparams.n_swa == 0) { - // Phi-4 doesn't use sliding window attention - KQ_mask = build_inp_KQ_mask(); - } else { - KQ_mask = build_inp_KQ_mask_swa(); - } + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -3854,12 +3310,12 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); - struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(attn_norm_output, "attn_norm", il); struct ggml_tensor * Qcur = nullptr; @@ -3867,16 +3323,16 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3901,9 +3357,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3916,14 +3372,14 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, residual); residual = cur; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3932,7 +3388,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3947,20 +3403,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (model.output_b != nullptr) { cb(cur, "result_output_no_bias", -1); @@ -3984,20 +3440,19 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, 
"attn_norm", il); struct ggml_tensor * attention_norm = cur; @@ -4005,13 +3460,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4026,9 +3481,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct ggml_tensor * sa_out = cur; @@ -4044,7 +3499,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4055,7 +3510,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4064,13 +3519,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4089,13 +3544,12 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -4104,15 +3558,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4128,9 +3582,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ 
-4146,13 +3600,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4162,20 +3616,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4194,24 +3648,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4239,9 +3692,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4257,13 +3710,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4273,20 +3726,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4304,41 +3757,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = 
build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4359,9 +3811,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4375,12 +3827,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4389,7 +3841,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4398,13 +3850,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4422,41 +3874,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = 
llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4477,9 +3928,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4493,12 +3944,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4507,7 +3958,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4516,13 +3967,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4546,7 +3997,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // scale the input embeddings inpL = ggml_scale(ctx0, inpL, scale_embd); @@ -4555,17 +4006,16 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -4575,9 +4025,9 
@@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -4616,9 +4066,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -4670,9 +4120,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -4692,12 +4142,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4711,7 +4161,7 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4720,9 +4170,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head scaling @@ -4731,7 +4181,7 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4747,7 +4197,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4755,26 +4205,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + 
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4792,9 +4241,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -4807,14 +4256,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4824,7 +4273,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4833,13 +4282,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4855,7 +4304,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4863,31 +4312,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // gemma 2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { - // (il % 2) layers use SWA - struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? 
KQ_mask_swa : KQ_mask; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4911,14 +4354,14 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -4931,14 +4374,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4947,13 +4390,13 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4962,13 +4405,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); @@ -4993,41 +4436,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE 
them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5048,9 +4490,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5065,12 +4507,12 @@ struct llm_build_context { // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5079,7 +4521,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5088,13 +4530,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5109,21 +4551,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur, - state_copy, state_mask, - kv_head, n_kv, cb, il); + //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); + cur = lctx.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -5142,13 +4583,13 @@ struct llm_build_context { } // final rmsnorm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + 
LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5167,41 +4608,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5220,16 +4660,16 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -5247,9 +4687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5264,7 +4704,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5276,7 +4716,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5285,13 +4725,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = 
ggml_scale(ctx0, cur, f_logit_scale); @@ -5315,15 +4755,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // cohere2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -5331,35 +4768,34 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { // three layers sliding window attention (window size 4096) and ROPE // fourth layer uses global attention without positional embeddings - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; + const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il); + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5385,8 +4821,8 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, + n_tokens, 1.0f / sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5401,7 +4837,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -5410,7 +4846,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); 
cb(cur, "l_out", il); // input for next layer @@ -5419,11 +4855,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); @@ -5455,41 +4891,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -5510,9 +4945,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5527,12 +4962,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5543,7 +4978,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5552,13 +4987,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, NULL, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); 
ggml_build_forward_expand(gf, cur); @@ -5579,13 +5014,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5595,21 +5029,21 @@ struct llm_build_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5629,14 +5063,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -5651,7 +5085,7 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5659,15 +5093,15 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5676,13 +5110,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5707,41 +5141,40 @@ struct llm_build_context { struct ggml_tensor * cur; 
struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5761,9 +5194,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5778,12 +5211,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -5797,7 +5230,7 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5806,13 +5239,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5828,13 +5261,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 
head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -5845,14 +5277,14 @@ struct llm_build_context { struct ggml_tensor * residual = cur; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); @@ -5866,14 +5298,14 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Kcur, "Kcur", il); Qcur = ggml_rope_ext( @@ -5891,9 +5323,9 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5908,12 +5340,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5923,7 +5355,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5932,12 +5364,12 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5955,24 +5387,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // 
self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -6000,9 +5431,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6019,13 +5450,13 @@ struct llm_build_context { struct ggml_tensor * attn_out = cur; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6037,7 +5468,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6050,13 +5481,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6065,7 +5496,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6073,13 +5504,13 @@ struct llm_build_context { } } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6100,33 +5531,32 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * 
Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -6143,9 +5573,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6160,12 +5590,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6177,12 +5607,12 @@ struct llm_build_context { cb(ffn_out, "ffn_out", il); // MoE - cur = llm_build_norm(ctx0, inpSA, hparams, + cur = build_norm(inpSA, model.layers[il].ffn_norm_exps, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6198,7 +5628,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6207,13 +5637,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6234,44 +5664,45 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -6292,9 +5723,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6309,13 +5740,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6325,7 +5756,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6340,7 +5771,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6354,7 +5785,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6363,13 +5794,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -6400,21 +5831,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, 
hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -6425,9 +5855,9 @@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -6470,9 +5900,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -6524,9 +5954,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6540,13 +5970,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6556,7 +5986,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6571,7 +6001,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6585,7 +6015,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6594,9 +6024,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, 
"result_norm", -1); // lm_head @@ -6617,26 +6047,25 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); if (model.layers[il].wq_scale) { Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); } @@ -6647,7 +6076,7 @@ struct llm_build_context { } // B1.K - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); if (model.layers[il].wk_scale) { Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); } @@ -6658,7 +6087,7 @@ struct llm_build_context { } // B1.V - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); if (model.layers[il].wv_scale) { Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); } @@ -6682,16 +6111,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, NULL, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].wo_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); } @@ -6712,12 +6141,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward forward - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -6725,12 +6154,12 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_sub_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].ffn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + cur = build_norm(cur, + model.layers[il].ffn_sub_norm, NULL, + LLM_NORM_RMS, il); cb(cur, "ffn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); + cur = build_lora_mm(model.layers[il].ffn_down, cur); if (model.layers[il].ffn_down_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); } @@ -6745,356 +6174,356 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, 
hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output - cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); + cur = build_lora_mm(model.tok_embd, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); return gf; } - struct ggml_cgraph * build_t5_enc() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + //struct ggml_cgraph * build_t5_enc() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // inpL = build_inp_embd(model.tok_embd); - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // GGML_ASSERT(lctx.is_encoding); + // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + // cb(Vcur, "Vcur", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + // cb(v, "v", il); - ggml_build_forward_expand(gf, cur); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + // ggml_build_forward_expand(gf, cur); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // cur = build_lora_mm(model.layers[il].wo_enc, cur); + // cb(cur, "kqv_out", il); + // } - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + // cb(ffn_inp, "ffn_inp", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // input for next layer - inpL = cur; - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up_enc, NULL, NULL, + // model.layers[il].ffn_gate_enc, NULL, NULL, + // model.layers[il].ffn_down_enc, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // ggml_tensor * layer_dir = cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - ggml_build_forward_expand(gf, cur); + // // input for next layer + // inpL = cur; + // } - return gf; - } + // cur = inpL; + // cb(cur, "result_embd", -1); - struct ggml_cgraph * build_t5_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + // cur = build_norm(cur, + // model.output_norm_enc, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; + // ggml_build_forward_expand(gf, cur); - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // return gf; + //} - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + //struct ggml_cgraph * build_t5_dec() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - GGML_ASSERT(!lctx.is_encoding); - GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * embd_enc = build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + // inpL = build_inp_embd(model.tok_embd); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // 
GGML_ASSERT(!lctx.is_encoding); + // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // struct ggml_tensor * embd_enc = build_inp_embd_enc(); + // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + // cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // build_kv_store(gf, Kcur, Vcur, il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = + // ggml_view_3d(ctx0, kv_self.k_l[il], + // n_embd_head_k, n_kv, n_head_kv, + // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + // 0); + // cb(k, "k", il); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * v = + // ggml_view_3d(ctx0, kv_self.v_l[il], + // n_kv, n_embd_head_v, n_head_kv, + // ggml_element_size(kv_self.v_l[il])*n_ctx, + // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + // 0); + // cb(v, "v", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - ggml_build_forward_expand(gf, cur); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + // cb(kqv, "kqv", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * inpCA = cur; + // ggml_build_forward_expand(gf, cur); - // norm - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); + // cur = build_lora_mm(model.layers[il].wo, cur); + // cb(cur, "kqv_out", il); + // } - // cross-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); + // cur = ggml_add(ctx0, cur, inpSA); + // cb(cur, "cross_inp", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * inpCA = cur; - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); + // // norm + // cur = build_norm(cur, + // model.layers[il].attn_norm_cross, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm_cross", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + // // cross-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + // cb(Vcur, "Vcur", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, 
"kq_soft_max_ext", il); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + // cb(v, "v", il); - ggml_build_forward_expand(gf, cur); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - } + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); + // ggml_build_forward_expand(gf, cur); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // cur = build_lora_mm(model.layers[il].wo_cross, cur); + // cb(cur, "kqv_out", il); + // } - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + // cb(ffn_inp, "ffn_inp", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // input for next layer - inpL = cur; - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // // input for next layer + // inpL = cur; + // } - ggml_build_forward_expand(gf, cur); + // cur = inpL; + // cb(cur, "result_embd", -1); - return gf; - } + // cur = build_norm(cur, + // model.output_norm, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // // lm_head + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} struct ggml_cgraph * build_jais() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -7106,21 +6535,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7136,9 +6564,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); + Kcur, Vcur, Qcur, 
n_tokens, 1.0f/float(n_embd_head), cb, il); } if (il == n_layer - 1) { @@ -7154,13 +6582,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7173,13 +6601,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -7198,21 +6626,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -7221,7 +6648,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7249,9 +6676,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7268,13 +6695,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7288,13 +6715,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7312,42 +6739,41 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct 
ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7368,9 +6794,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7384,13 +6810,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7409,13 +6835,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7436,44 +6862,43 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - 
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7494,9 +6919,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7511,12 +6936,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7536,13 +6961,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7550,230 +6975,232 @@ struct llm_build_context { return gf; } - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - // Token shift state dimensions should be 2 * n_emb - GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); - - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); - - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); - - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, 
token_shift, n_embd, 2, n_seqs); - - struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - att_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); - - cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - ggml_build_forward_expand(gf, cur); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); - - struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il); - x_prev = ggml_concat( - ctx0, - ffn_shift, - ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - 1 - ); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev)); - ggml_build_forward_expand(gf, cur); - - struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); - - token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); - - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - ) - ); - - if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - cur = ggml_scale(ctx0, cur, 0.5F); - } - - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - cb(cur, "result_norm", -1); - - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } + //ggml_cgraph * build_rwkv6() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + // // Token shift state dimensions should be 2 * n_emb + // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // 
struct ggml_tensor * state_mask = build_inp_s_mask(); + + // inpL = build_inp_embd(model.tok_embd); + // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; + + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // hparams.n_embd_k_s(), n_seqs); + + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); + + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); + + // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // att_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); + + // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); + // ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); + + // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + // x_prev = ggml_concat( + // ctx0, + // ffn_shift, + // ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), + // 1 + // ); + // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); + // ggml_build_forward_expand(gf, cur); + + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); + // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); + + // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); + + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + // ) + // ); + + // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + // cur = ggml_scale(ctx0, cur, 0.5F); + // } + + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); + + // // input for next layer + // inpL = cur; + // } + + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + 
// cb(cur, "result_norm", -1); + + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); - - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); - - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); - - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - token_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); - - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - ggml_build_forward_expand(gf, ffn_inp); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); - - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, 
"result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } + //ggml_cgraph * build_rwkv6qwen2() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // struct ggml_tensor * state_mask = build_inp_s_mask(); + + // inpL = build_inp_embd(model.tok_embd); + + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; + + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // hparams.n_embd_k_s(), n_seqs); + + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); + + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); + + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // token_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); + + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); + + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); + // ggml_build_forward_expand(gf, ffn_inp); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); + + // cb(ffn_inp, "ffn_inp", il); + + // // feed-forward network + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); + + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + // cb(cur, "ffn_out", il); + + // cur = ggml_add(ctx0, cur, ffn_inp); + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); + + // // input for next layer + // inpL = cur; + // } + + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://github.com/facebookresearch/chameleon // based on the original 
build_llama() function, changes: @@ -7794,13 +7221,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7809,22 +7235,22 @@ struct llm_build_context { if (hparams.swin_norm) { cur = inpL; } else { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].attn_q_norm) { @@ -7834,10 +7260,10 @@ struct llm_build_context { 0); cb(Qcur, "Qcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } @@ -7848,10 +7274,10 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -7869,14 +7295,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); } } @@ -7893,13 +7319,13 @@ struct llm_build_context { // feed-forward network if (!hparams.swin_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7908,9 +7334,9 @@ struct llm_build_context { cb(cur, "ffn_out", il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } @@ -7926,13 +7352,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = 
llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_with_img_logits", -1); // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. @@ -7959,7 +7385,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); @@ -7978,20 +7404,20 @@ struct llm_build_context { case 3: case 4: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm1, layer.norm1_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); cur = ggml_add(ctx0, cur, layer.conv1_b); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm2, layer.norm2_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); @@ -8002,10 +7428,10 @@ struct llm_build_context { } break; case 2: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.attn_norm, layer.attn_norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); struct ggml_tensor * q; struct ggml_tensor * k; @@ -8035,10 +7461,10 @@ struct llm_build_context { } break; case 5: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); } break; default: GGML_ABORT("unknown posnet layer"); }; @@ -8046,10 +7472,10 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); @@ -8066,12 +7492,12 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, layer.pw1, layer.pw1_b, NULL, NULL, NULL, NULL, layer.pw2, layer.pw2_b, NULL, @@ -8089,13 +7515,13 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); @@ -8106,7 +7532,7 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { llama_ubatch dummy = {}; dummy.equal_seqs = true; @@ -8116,7 +7542,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const llm.init(); - struct ggml_cgraph * result = llm.build_defrag(ids); + struct ggml_cgraph * result = llm.build_defrag(); llm.free(); @@ -8356,18 +7782,18 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_bitnet(); } break; - case LLM_ARCH_T5: - { - if (lctx.is_encoding) { - result = llm.build_t5_enc(); - } else { - result = llm.build_t5_dec(); - } - } break; - case LLM_ARCH_T5ENCODER: - { - result = llm.build_t5_enc(); - } break; + //case LLM_ARCH_T5: 
+ // { + // if (lctx.is_encoding) { + // result = llm.build_t5_enc(); + // } else { + // result = llm.build_t5_dec(); + // } + // } break; + //case LLM_ARCH_T5ENCODER: + // { + // result = llm.build_t5_enc(); + // } break; case LLM_ARCH_JAIS: { result = llm.build_jais(); @@ -8380,14 +7806,14 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - case LLM_ARCH_RWKV6: - { - result = llm.build_rwkv6(); - } break; - case LLM_ARCH_RWKV6QWEN2: - { - result = llm.build_rwkv6qwen2(); - } break; + //case LLM_ARCH_RWKV6: + // { + // result = llm.build_rwkv6(); + // } break; + //case LLM_ARCH_RWKV6QWEN2: + // { + // result = llm.build_rwkv6qwen2(); + // } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); @@ -8543,6 +7969,7 @@ static int llama_decode_impl( } else { ubatch = lctx.sbatch.split_simple(n_ubatch); } + const uint32_t n_tokens = ubatch.n_tokens; // count the outputs in this u_batch @@ -8567,6 +7994,8 @@ static int llama_decode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + // non-causal masks do not use the KV cache if (hparams.causal_attn) { llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() @@ -8600,6 +8029,12 @@ static int llama_decode_impl( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output is always the last tensor in the graph struct ggml_tensor * res = ggml_graph_node(gf, -1); struct ggml_tensor * embd = ggml_graph_node(gf, -2); @@ -8623,12 +8058,6 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); @@ -8850,11 +8279,17 @@ static int llama_encode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output embeddings after the final encoder normalization struct ggml_tensor * embd = nullptr; @@ -8875,10 +8310,6 @@ static int llama_encode_impl( } } - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); switch (compute_status) { case GGML_STATUS_SUCCESS: @@ -8966,227 +8397,6 @@ static int llama_encode_impl( return 0; } -// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache -static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { - auto & kv_self = lctx.kv_self; - - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - 
- // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = model.max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - kv_self.cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - kv_self.head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = kv_self.size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to 
[id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - // ggml_graph defrag - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); -#endif - - //const int64_t t_end = ggml_time_us(); - - //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); -} - // TODO: move to llama_context static void llama_kv_self_update_impl(llama_context & lctx) { bool need_reserve = false; @@ -9200,13 +8410,15 @@ static void llama_kv_self_update_impl(llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + lctx.prepare_k_shift(); + ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - lctx.set_k_shift(kv); + lctx.set_inputs({}); llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); @@ -9224,7 +8436,13 @@ static void llama_kv_self_update_impl(llama_context & lctx) { // defragment the KV cache if needed if (kv.do_defrag) { - llama_kv_cache_defrag_impl(lctx); + lctx.prepare_defrag(); + + ggml_backend_sched_reset(lctx.sched.get()); + + ggml_cgraph * gf = llama_build_graph_defrag(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; @@ -9253,16 +8471,16 @@ int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, float scale) { - ctx->lora[adapter] = scale; + ctx->loras[adapter] = scale; return 0; } int32_t llama_rm_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter) { - auto pos = ctx->lora.find(adapter); - if (pos != ctx->lora.end()) { - ctx->lora.erase(pos); + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); return 0; } @@ -9270,7 +8488,7 @@ int32_t llama_rm_adapter_lora( } void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->lora.clear(); + ctx->loras.clear(); } int32_t llama_apply_adapter_cvec( From b4ec1d44294b628a811cc97367bb7ace0a32c9fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Jan 2025 21:55:12 +0200 Subject: [PATCH 12/84] cont : move kv_self update to llama_context ggml-ci --- src/llama-context.cpp | 119 +++++++++++++++++++++++++++ src/llama-context.h | 10 +++ src/llama.cpp | 182 +++++++----------------------------------- 3 files changed, 157 insertions(+), 154 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 910e2243d7e8a..daea125fe0704 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,38 @@ static int32_t 
llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +enum ggml_status llama_context::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + + +llama_pos llama_context::pos_max() const { + return kv_self.pos_max(); +} + // TODO: improve void llama_context::reset() { inp_tokens = nullptr; @@ -540,6 +572,93 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } +bool llama_context::kv_self_update() { + bool need_reserve = false; + + auto & kv = kv_self; + + if (kv.has_shift) { + if (!kv.can_shift) { + GGML_ABORT("The current context does not support K-shift"); + } + + // apply K-shift if needed + if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + prepare_k_shift(); + + ggml_backend_sched_reset(sched.get()); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context * ctx0 = ggml_init(params); + + reset(); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + build_k_shift(ctx0, gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs({}); + + compute_graph(gf, false); + + ggml_free(ctx0); + + need_reserve = true; + } + + { + kv.has_shift = false; + + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; + } + } + } + + // defragment the KV cache if needed + if (kv.do_defrag) { + prepare_defrag(); + + ggml_backend_sched_reset(sched.get()); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context * ctx0 = ggml_init(params); + + reset(); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + build_defrag(ctx0, gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + // no input + //set_inputs({}); + + compute_graph(gf, false); + + ggml_free(ctx0); + + need_reserve = true; + + kv.do_defrag = false; + } + + return need_reserve; +} + void llama_context::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama-context.h b/src/llama-context.h index a2f41b5c8fc7d..bc33fc6ef4890 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -79,6 +79,13 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + // returns the result of ggml_backend_sched_graph_compute_async execution + enum ggml_status compute_graph( + ggml_cgraph * graph, + bool batched); + + llama_pos pos_max() const; + void reset(); void 
prepare_k_shift(); @@ -129,6 +136,9 @@ struct llama_context { struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] struct ggml_tensor * inp_K_shift; // I32 [kv_size] + // return true if need to reserve new worst-case graph + bool kv_self_update(); + void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama.cpp b/src/llama.cpp index a2e5e0bea0fb5..6e2faa71c342b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -110,7 +110,6 @@ struct llm_build_context { const llama_hparams & hparams; const llama_cparams & cparams; const llama_ubatch & ubatch; - //const llama_kv_cache & kv_self; const llama_adapter_cvec & cvec; const llama_loras & loras; @@ -137,8 +136,6 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) - //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; const int32_t n_ctx_orig; @@ -166,7 +163,6 @@ struct llm_build_context { hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - //kv_self (lctx.kv_self), cvec (lctx.cvec), loras (lctx.loras), n_embd (hparams.n_embd), @@ -190,8 +186,6 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - //n_kv (worst_case ? kv_self.size : kv_self.n), - //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), n_ctx_orig (cparams.n_ctx_orig_yarn), @@ -7532,40 +7526,6 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { - llama_ubatch dummy = {}; - dummy.equal_seqs = true; - - llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct ggml_cgraph * result = llm.build_defrag(); - - llm.free(); - - return result; -} - -static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { - llama_ubatch dummy = {}; - dummy.equal_seqs = true; - - llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct ggml_cgraph * result = llm.build_k_shift(); - - llm.free(); - - return result; -} - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_ubatch & ubatch, @@ -7836,33 +7796,6 @@ static struct ggml_cgraph * llama_build_graph( return result; } -// returns the result of ggml_backend_sched_graph_compute_async execution -static enum ggml_status llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads, - ggml_threadpool * threadpool) { - if (lctx.backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(lctx.backend_cpu, threadpool); - } - - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } - - auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: 
ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - - return status; -} - // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -7887,7 +7820,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens_all = batch.n_tokens; @@ -7989,16 +7922,11 @@ static int llama_decode_impl( lctx.n_outputs = n_outputs_new; } - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; - - GGML_ASSERT(n_threads > 0); - lctx.prepare_decode(ubatch); // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() + llama_kv_self_update(&lctx); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -8058,7 +7986,7 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); switch (compute_status) { @@ -8226,7 +8154,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens = batch.n_tokens; @@ -8274,11 +8202,6 @@ static int llama_encode_impl( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? 
lctx.threadpool : lctx.threadpool_batch; - - GGML_ASSERT(n_threads > 0); - lctx.prepare_decode(ubatch); ggml_backend_sched_reset(lctx.sched.get()); @@ -8310,7 +8233,7 @@ static int llama_encode_impl( } } - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: break; @@ -8397,76 +8320,6 @@ static int llama_encode_impl( return 0; } -// TODO: move to llama_context -static void llama_kv_self_update_impl(llama_context & lctx) { - bool need_reserve = false; - - auto & kv = lctx.kv_self; - - if (kv.has_shift) { - if (!kv.can_shift) { - GGML_ABORT("The current context does not support K-shift"); - } - - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - lctx.prepare_k_shift(); - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs({}); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); - - need_reserve = true; - } - - { - kv.has_shift = false; - - for (uint32_t i = 0; i < kv.size; ++i) { - kv.cells[i].delta = 0; - } - } - } - - // defragment the KV cache if needed - if (kv.do_defrag) { - lctx.prepare_defrag(); - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_defrag(lctx); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); - - need_reserve = true; - - kv.do_defrag = false; - } - - // reserve a worst case graph again - if (need_reserve) { - // TODO: extract to a function - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); - llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - } -} - int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, @@ -9224,9 +9077,30 @@ void llama_kv_cache_update(llama_context * ctx) { llama_kv_self_update(ctx); } -// TODO: move to llama-context void llama_kv_self_update(llama_context * ctx) { - llama_kv_self_update_impl(*ctx); + const bool need_reserve = ctx->kv_self_update(); + + // reserve a worst case graph again + if (need_reserve) { + // TODO: extract to a function + const auto & cparams = ctx->cparams; + const auto & model = ctx->model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); + + // initialize scheduler with the worst-case graph + 
ggml_backend_sched_reset(ctx->sched.get()); + if (!ggml_backend_sched_reserve(ctx->sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + } } /// From f0713498fd05afe117647c76f536866640b77b90 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 11:51:35 +0200 Subject: [PATCH 13/84] context : add get_ctx_padding() ggml-ci --- src/llama-context.cpp | 4 ++++ src/llama-context.h | 3 +++ src/llama.cpp | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index daea125fe0704..6a73659d05136 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -64,6 +64,10 @@ llama_pos llama_context::pos_max() const { return kv_self.pos_max(); } +uint32_t llama_context::get_ctx_padding(const llama_cparams & cparams) const { + return kv_self.get_padding(cparams); +} + // TODO: improve void llama_context::reset() { inp_tokens = nullptr; diff --git a/src/llama-context.h b/src/llama-context.h index bc33fc6ef4890..45eaafaad16cb 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -84,8 +84,11 @@ struct llama_context { ggml_cgraph * graph, bool batched); + // max token position across all sequences in the current context llama_pos pos_max() const; + uint32_t get_ctx_padding(const llama_cparams & cparams) const; + void reset(); void prepare_k_shift(); diff --git a/src/llama.cpp b/src/llama.cpp index 6e2faa71c342b..569c67c028305 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7820,6 +7820,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -8154,6 +8155,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -8629,7 +8631,7 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->kv_self.get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; From c75ba6851e1f6079ff7c823672908a2e5767418a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 12:41:16 +0200 Subject: [PATCH 14/84] context : move adapter code in the implementation [no ci] --- src/llama-context.cpp | 37 +++++++++++++++++++++++++++++++++++++ src/llama.cpp | 40 +++++----------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6a73659d05136..5cb31abc085ee 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1788,6 +1788,43 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id return it->second.data(); } +// llama adapter API + +int32_t llama_set_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter, + float scale) { + ctx->loras[adapter] = scale; + return 0; +} + +int32_t llama_rm_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter) { + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); + return 0; + } + + return -1; +} + +void llama_clear_adapter_lora(struct llama_context * ctx) { + ctx->loras.clear(); +} + +int32_t llama_apply_adapter_cvec( + struct llama_context * ctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); +} + + // llama state API // deprecated diff --git a/src/llama.cpp b/src/llama.cpp index 569c67c028305..b80b1c4d1688a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8322,40 +8322,6 @@ static int llama_encode_impl( return 0; } -int32_t llama_set_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter, - float scale) { - ctx->loras[adapter] = scale; - return 0; -} - -int32_t llama_rm_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; - } - - return -1; -} - -void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); -} - -int32_t llama_apply_adapter_cvec( - struct llama_context * ctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); -} - // // interface implementation // @@ -8924,7 +8890,7 @@ struct llama_context * llama_new_context_with_model( } // -// kv cache +// kv cache view // struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { @@ -8935,6 +8901,10 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * llama_kv_cache_view_update(view, ctx->kv_self); } +// +// kv cache +// + // deprecated int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { return llama_kv_self_n_tokens(ctx); From 133ad6a7232914459afc902107a53342d3abfb3b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 14:42:09 +0200 Subject: [PATCH 15/84] context : initial need_reserve logic ggml-ci --- src/llama-context.cpp | 171 ++++++++++++++++++++- src/llama-context.h | 4 +- src/llama.cpp | 337 +++++++++++++----------------------------- 3 files changed, 268 insertions(+), 244 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5cb31abc085ee..d696090cc5b3f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -576,9 
+576,7 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } -bool llama_context::kv_self_update() { - bool need_reserve = false; - +void llama_context::kv_self_update() { auto & kv = kv_self; if (kv.has_shift) { @@ -655,12 +653,14 @@ bool llama_context::kv_self_update() { ggml_free(ctx0); - need_reserve = true; - kv.do_defrag = false; + + need_reserve = true; } +} - return need_reserve; +void llama_kv_self_update(llama_context * ctx) { + ctx->kv_self_update(); } void llama_context::build_attn_inp( @@ -1824,6 +1824,165 @@ int32_t llama_apply_adapter_cvec( return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); } +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { + return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); +} + +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { + llama_kv_cache_view_update(view, ctx->kv_self); +} + +// +// kv cache +// + +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} + +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { + return llama_kv_cache_n_tokens(&ctx->kv_self); +} + +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} + +int32_t llama_kv_self_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} + +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} + +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} + +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); +} + +void llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); +} + +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +} + +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(&ctx->kv_self, 
seq_id, p0, p1, d); +} + +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} + +llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} + +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); +} + +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} // llama state API diff --git a/src/llama-context.h b/src/llama-context.h index 45eaafaad16cb..eb9a1739170dc 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -62,6 +62,7 @@ struct llama_context { int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch bool logits_all = false; + bool need_reserve = false; // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE @@ -87,6 +88,7 @@ struct llama_context { // max token position across all sequences in the current context llama_pos pos_max() const; + // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; void reset(); @@ -140,7 +142,7 @@ struct llama_context { struct ggml_tensor * inp_K_shift; // I32 [kv_size] // return true if need to reserve new worst-case graph - bool kv_self_update(); + void kv_self_update(); void build_attn_inp( ggml_context * ctx0, diff --git a/src/llama.cpp b/src/llama.cpp index b80b1c4d1688a..5807fa38802da 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -28,57 +28,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { - // loading time will be recalculated after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = 0; - time_meas tm(model.t_load_us); - - model.t_start_us = tm.t_start_us; - - try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); - - ml.print_info(); - - model.hparams.vocab_only = params.vocab_only; - - try { - model.load_arch(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model architecture: " + std::string(e.what())); - } - try { - model.load_hparams(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); - } - try { - model.load_vocab(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); - } - - model.load_stats(ml); - model.print_info(); - - if (params.vocab_only) { - LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return 0; - } - - if (!model.load_tensors(ml)) { - return -2; - } - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); - return -1; - } 
- - return 0; -} - // // llm_build // @@ -7951,6 +7900,30 @@ static int llama_decode_impl( } } + // reserve a worst case graph if needed + // TODO: extract to a function + if (lctx.need_reserve) { + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched.get()); @@ -8206,6 +8179,31 @@ static int llama_encode_impl( lctx.prepare_decode(ubatch); + // reserve a worst case graph if needed + // TODO: extract to a function + if (lctx.need_reserve) { + // TODO: extract to a function + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8419,6 +8417,57 @@ int64_t llama_time_us(void) { return ggml_time_us(); } +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { + // loading time will be recalculated after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = 0; + time_meas tm(model.t_load_us); + + model.t_start_us = tm.t_start_us; + + try { + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + + ml.print_info(); + + model.hparams.vocab_only = params.vocab_only; + + try { + model.load_arch(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model architecture: " + std::string(e.what())); + } + try { + model.load_hparams(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); + } + try { + model.load_vocab(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error 
loading model vocabulary: " + std::string(e.what())); + } + + model.load_stats(ml); + model.print_info(); + + if (params.vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return 0; + } + + if (!model.load_tensors(ml)) { + return -2; + } + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); + return -1; + } + + return 0; +} + static struct llama_model * llama_model_load_from_file_impl( const std::string & path_model, std::vector & splits, @@ -8889,192 +8938,6 @@ struct llama_context * llama_new_context_with_model( return llama_init_from_model(model, params); } -// -// kv cache view -// - -struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { - return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); -} - -void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { - llama_kv_cache_view_update(view, ctx->kv_self); -} - -// -// kv cache -// - -// deprecated -int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { - return llama_kv_self_n_tokens(ctx); -} - -int32_t llama_kv_self_n_tokens(const llama_context * ctx) { - return llama_kv_cache_n_tokens(&ctx->kv_self); -} - -// deprecated -int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { - return llama_kv_self_used_cells(ctx); -} - -int32_t llama_kv_self_used_cells(const llama_context * ctx) { - return llama_kv_cache_used_cells(&ctx->kv_self); -} - -// deprecated -void llama_kv_cache_clear(llama_context * ctx) { - llama_kv_self_clear(ctx); -} - -void llama_kv_self_clear(llama_context * ctx) { - llama_kv_cache_clear(&ctx->kv_self); -} - -// deprecated -bool llama_kv_cache_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); -} - -bool llama_kv_self_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); -} - -// deprecated -void llama_kv_cache_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_self_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} - -// deprecated -void llama_kv_cache_seq_keep( - llama_context * ctx, - llama_seq_id seq_id) { - return llama_kv_self_seq_keep(ctx, seq_id); -} - -void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); -} - -// deprecated -void llama_kv_cache_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); -} - -void llama_kv_self_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); -} - -// deprecated -void llama_kv_cache_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); -} - -void llama_kv_self_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return 
llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); -} - -// deprecated -llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_self_seq_pos_max(ctx, seq_id); -} - -llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); -} - -// deprecated -void llama_kv_cache_defrag(llama_context * ctx) { - return llama_kv_self_defrag(ctx); -} - -void llama_kv_self_defrag(llama_context * ctx) { - return llama_kv_cache_defrag(&ctx->kv_self); -} - -// deprecated -bool llama_kv_cache_can_shift(const llama_context * ctx) { - return llama_kv_self_can_shift(ctx); -} - -bool llama_kv_self_can_shift(const llama_context * ctx) { - return llama_kv_cache_can_shift(&ctx->kv_self); -} - -// deprecated -void llama_kv_cache_update(llama_context * ctx) { - llama_kv_self_update(ctx); -} - -void llama_kv_self_update(llama_context * ctx) { - const bool need_reserve = ctx->kv_self_update(); - - // reserve a worst case graph again - if (need_reserve) { - // TODO: extract to a function - const auto & cparams = ctx->cparams; - const auto & model = ctx->model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(ctx->sched.get()); - if (!ggml_backend_sched_reserve(ctx->sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - } -} - /// int32_t llama_encode( From cb8f2095c6f74d9fbb9bdfbb2ae1bf6178472150 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 19:37:52 +0200 Subject: [PATCH 16/84] wip --- src/llama.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5807fa38802da..6c8df8a112a0b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7773,6 +7773,7 @@ static int llama_decode_impl( llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens_all = batch.n_tokens; const auto & model = lctx.model; @@ -7800,9 +7801,6 @@ static int llama_decode_impl( } lctx.n_queued_tokens += n_tokens_all; - auto & kv_self = lctx.kv_self; - llama_kv_slot_restorer kv_slot_restorer(kv_self); - const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = vocab.n_tokens(); @@ -7828,16 +7826,19 @@ static int llama_decode_impl( n_outputs = 1; } - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); - // reserve output buffer if (llama_output_reserve(lctx, n_outputs) < n_outputs) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); return -2; }; + auto & kv_self = lctx.kv_self; + llama_kv_slot_restorer kv_slot_restorer(kv_self); + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ n_outputs == n_tokens_all); + while (lctx.sbatch.n_tokens > 0) { llama_ubatch ubatch; if (kv_self.recurrent) { @@ -8645,7 +8646,6 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - // this is necessary due to kv_self.n being padded later during inference cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); // with causal attention, the batch size is limited by the context size From 99422dfa3f0c686d89492958946a9b2ca91012da Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 20:30:16 +0200 Subject: [PATCH 17/84] context : introduce llama_batch_manager ggml-ci --- src/llama-context.cpp | 130 ++++++++++++++++++++++++++++++++++++++++-- src/llama-context.h | 18 +++++- src/llama.cpp | 87 ++++++---------------------- 3 files changed, 162 insertions(+), 73 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d696090cc5b3f..de54321df2f1a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,132 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +struct llama_batch_manager : public llama_batch_manager_i { + llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { + const auto & hparams = lctx.model.hparams; + const auto & n_embd = hparams.n_embd; + + const auto & kv_self = lctx.kv_self; + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ logits_all); + } + + ~llama_batch_manager() override { + } + + virtual llama_ubatch next() override { + ubatch = llama_ubatch(); + + const auto & cparams = lctx.cparams; + const auto & kv_self = lctx.kv_self; + + const auto & n_ubatch = cparams.n_ubatch; + + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + if (kv_self.recurrent) { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = lctx.sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = lctx.sbatch.split_equal(n_ubatch); + } + } else { + ubatch = lctx.sbatch.split_simple(n_ubatch); + } + + return ubatch; + } + + virtual bool prepare() override { + const auto & 
cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + + auto & kv_self = lctx.kv_self; + + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + llama_kv_self_update(&lctx); + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } + + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { + return false; + } + + kv_slot_restorer.save(slot_info); + + if (!kv_self.recurrent) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); + } + } + + return true; + } + + virtual void restore() override { + kv_slot_restorer.restore(lctx.kv_self); + } + + virtual void update() override { + auto & kv_self = lctx.kv_self; + + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + } + + virtual void finalize() override { + const auto & cparams = lctx.cparams; + + auto & kv_self = lctx.kv_self; + + // decide if we need to defrag the kv cache + if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { + const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > cparams.defrag_thold) { + //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + + kv_self.defrag(); + } + } + } + + llama_context & lctx; + + const llama_batch & batch; + + llama_ubatch ubatch; + + llama_kv_slot_restorer kv_slot_restorer; +}; + +std::unique_ptr llama_context::prepare_batch(const llama_batch & batch, bool logits_all) { + return std::make_unique(*this, batch, logits_all); +} + enum ggml_status llama_context::compute_graph( ggml_cgraph * graph, bool batched) { @@ -59,7 +185,6 @@ enum ggml_status llama_context::compute_graph( return status; } - llama_pos llama_context::pos_max() const { return kv_self.pos_max(); } @@ -94,9 +219,6 @@ void llama_context::prepare_k_shift() { void llama_context::prepare_defrag() { } -void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { -} - // llama input void llama_context::set_inputs(const llama_ubatch & ubatch) { diff --git a/src/llama-context.h b/src/llama-context.h index eb9a1739170dc..47233f4f52497 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,6 +16,20 @@ using llama_loras = std::unordered_map; +// TODO: this is very WIP - improve +struct llama_batch_manager_i { + virtual ~llama_batch_manager_i() = default; + + //bool is_done() const; + + virtual llama_ubatch next() = 0; + + virtual bool prepare() = 0; + virtual void restore() = 0; + virtual void update() = 0; + virtual void finalize() = 0; +}; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -80,6 +94,9 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + // TODO: do not pass logits_all explicitly + std::unique_ptr prepare_batch(const llama_batch & batch, bool logits_all); + 
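+    // rough usage sketch (mirrors how llama_decode_impl drives the batch manager in this
+    // patch; error handling trimmed):
+    //
+    //   auto batch_manager = lctx.prepare_batch(batch, logits_all);
+    //
+    //   while (lctx.sbatch.n_tokens > 0) {
+    //       llama_ubatch ubatch = batch_manager->next();
+    //
+    //       if (!batch_manager->prepare()) {
+    //           batch_manager->restore(); // roll back any KV cache slot changes
+    //           return -3;
+    //       }
+    //
+    //       // ... build and compute the graph for this ubatch ...
+    //
+    //       batch_manager->update();   // advance the KV cache ring buffer head
+    //   }
+    //
+    //   batch_manager->finalize();     // e.g. queue KV cache defrag if fragmented
+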
// returns the result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( ggml_cgraph * graph, @@ -95,7 +112,6 @@ struct llama_context { void prepare_k_shift(); void prepare_defrag(); - void prepare_decode(const llama_ubatch & ubatch); void set_inputs(const llama_ubatch & ubatch); diff --git a/src/llama.cpp b/src/llama.cpp index 6c8df8a112a0b..8f6de199a505c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7807,8 +7807,6 @@ static int llama_decode_impl( uint32_t n_outputs = 0; uint32_t n_outputs_prev = 0; - const auto n_ubatch = cparams.n_ubatch; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; @@ -7832,27 +7830,19 @@ static int llama_decode_impl( return -2; }; - auto & kv_self = lctx.kv_self; - llama_kv_slot_restorer kv_slot_restorer(kv_self); + const bool logits_all = n_outputs == n_tokens_all; + + //auto & kv_self = lctx.kv_self; + //llama_kv_slot_restorer kv_slot_restorer(kv_self); + + //lctx.sbatch.from_batch(batch, n_embd, + // /* simple_split */ !kv_self.recurrent, + // /* logits_all */ logits_all); - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); + auto batch_manager = lctx.prepare_batch(batch, logits_all); while (lctx.sbatch.n_tokens > 0) { - llama_ubatch ubatch; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); - } - } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); - } + llama_ubatch ubatch = batch_manager->next(); const uint32_t n_tokens = ubatch.n_tokens; @@ -7873,32 +7863,10 @@ static int llama_decode_impl( lctx.n_outputs = n_outputs_new; } - lctx.prepare_decode(ubatch); - - // non-causal masks do not use the KV cache - if (hparams.causal_attn) { - llama_kv_self_update(&lctx); - - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*n_tokens) { - kv_self.head = 0; - } - - const auto slot_info = kv_self.find_slot(ubatch); - if (!slot_info) { - return 1; - } - kv_slot_restorer.save(slot_info); - - if (!kv_self.recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); - } + if (!batch_manager->prepare()) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + batch_manager->restore(); + return -3; } // reserve a worst case graph if needed @@ -7963,7 +7931,7 @@ static int llama_decode_impl( const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { - kv_slot_restorer.restore(kv_self); + batch_manager->restore(); switch (compute_status) { case GGML_STATUS_ABORTED: return 2; @@ -7975,15 +7943,7 @@ static int llama_decode_impl( } } - // update the kv ring buffer - { - kv_self.head += n_tokens; - - // Ensure kv 
cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; - } - } + batch_manager->update(); // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { @@ -8061,6 +8021,7 @@ static int llama_decode_impl( } } } + n_outputs_prev += lctx.n_outputs; } @@ -8089,17 +8050,7 @@ static int llama_decode_impl( // wait for the computation to finish (automatically done when obtaining the model output) //llama_synchronize(&lctx); - // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); - - kv_self.defrag(); - } - } + batch_manager->finalize(); // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. @@ -8178,7 +8129,7 @@ static int llama_encode_impl( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - lctx.prepare_decode(ubatch); + //batch_manager->prepare(ubatch); // reserve a worst case graph if needed // TODO: extract to a function From a0c500b4dc91b87acba2529d2db7a2d28f1c3bb6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 21:11:03 +0200 Subject: [PATCH 18/84] context : prepare for abstraction ggml-ci --- src/llama-context.cpp | 307 ++++++++++++++++++++++++++++++++++++++++- src/llama-context.h | 11 +- src/llama.cpp | 314 ++---------------------------------------- 3 files changed, 323 insertions(+), 309 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index de54321df2f1a..4e6033ff15640 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,309 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function fn_build_graph_worst) : + model(model), + t_start_us(model.t_start_us), + t_load_us (model.t_load_us) { + + const auto & hparams = model.hparams; + + cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; + cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); + + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : + hparams.n_ctx_train; + + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + } + + cparams.yarn_attn_factor *= hparams.rope_attn_factor; + + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + } else { + cparams.pooling_type = hparams.pooling_type; + } + } + + if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { + cparams.causal_attn = hparams.causal_attn; + } else { + cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + } + + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + logits_all = params.logits_all; + + // build worst-case graph for encoder if a model contains encoder + is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() + + uint32_t kv_size = cparams.n_ctx; + ggml_type type_k = params.type_k; + ggml_type type_v = params.type_v; + + // Mamba only needs a constant number of KV cache cells per sequence + if (llama_model_is_recurrent(&model)) { + // Mamba needs at least as many KV cells as there are sequences kept at any time + kv_size = std::max((uint32_t) 1, params.n_seq_max); + // it's probably best to keep as much precision as possible for the 
states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + } + + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + + if (!hparams.vocab_only) { + // GPU backends + for (auto * dev : model.devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + + // add ACCEL backends (such as BLAS) + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + } + + // add CPU backend + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + throw std::runtime_error("failed to initialize CPU backend"); + } + backends.emplace_back(backend_cpu); + + // create a list of the set_n_threads functions in the backends + for (auto & backend : backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); + } + } + } + + llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); + + if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + throw std::runtime_error("failed to initialize self-attention cache"); + } + + { + const size_t memory_size_k = kv_self.size_k_bytes(); + const size_t memory_size_v = kv_self.size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } + + // graph outputs buffer + { + // resized during inference when a batch uses more outputs + if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); + throw std::runtime_error("failed to reserve initial output buffer"); + } + + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name (buf_output.get()), + ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + } + + // scheduler and compute buffers + { + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : backends) { + auto * buft = 
ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); + } + + const size_t max_nodes = model.max_nodes(); + + // buffer used to store the computation graph and the tensor meta data + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + params.offload_kqv; + + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } + + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + } + + // initialize scheduler with the worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp); + + // reserve pp graph first so that buffers are only allocated once + ggml_backend_sched_reserve(sched.get(), gf_pp); + int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_pp = ggml_graph_n_nodes(gf_pp); + + // reserve with tg graph to get the number of splits and nodes + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg); + ggml_backend_sched_reserve(sched.get(), gf_tg); + int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_tg = ggml_graph_n_nodes(gf_tg); + + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + gf_pp = fn_build_graph_worst(*this, ubatch_pp); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + for (size_t i = 0; i < backend_ptrs.size(); 
++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + } + } + } + +} + struct llama_batch_manager : public llama_batch_manager_i { llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & hparams = lctx.model.hparams; @@ -81,7 +384,7 @@ struct llama_batch_manager : public llama_batch_manager_i { // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_self_update(&lctx); + lctx.kv_self_update(); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -106,6 +409,8 @@ struct llama_batch_manager : public llama_batch_manager_i { } } + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + return true; } diff --git a/src/llama-context.h b/src/llama-context.h index 47233f4f52497..d0356e3ed28c3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -30,11 +30,14 @@ struct llama_batch_manager_i { virtual void finalize() = 0; }; +// TODO: make implementation details private +// TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - llama_context(const llama_model & model) - : model(model) - , t_start_us(model.t_start_us) - , t_load_us (model.t_load_us) {} + // TODO: store the worst-case graph build function and reuse it later + llama_context( + const llama_model & model, + const llama_context_params & params, + std::function fn_build_graph_worst); const struct llama_model & model; diff --git a/src/llama.cpp b/src/llama.cpp index 8f6de199a505c..408bd9030ffae 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7893,8 +7893,6 @@ static int llama_decode_impl( lctx.need_reserve = false; } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8574,309 +8572,17 @@ struct llama_context * llama_init_from_model( return nullptr; } - llama_context * ctx = new llama_context(*model); - - const auto & hparams = model->hparams; - auto & cparams = ctx->cparams; - - cparams.n_seq_max = std::max(1u, params.n_seq_max); - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; - cparams.embeddings = params.embeddings; - cparams.offload_kqv = 
params.offload_kqv; - cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); - - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; - } - - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : - hparams.n_ctx_train; - - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; - - auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { - rope_scaling_type = hparams.rope_scaling_type_train; - } - - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { - cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none - } - - if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 
1.0f : 0.0f; - } - - cparams.yarn_attn_factor *= hparams.rope_attn_factor; - - if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; - } else { - cparams.pooling_type = hparams.pooling_type; - } - } + llama_context * ctx = nullptr; - if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { - cparams.causal_attn = hparams.causal_attn; - } else { - cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; - } - - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - - if (n_ctx_per_seq < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - ctx->logits_all = params.logits_all; - - // build worst-case graph for encoder if a model contains encoder - ctx->is_encoding = llama_model_has_encoder(model); - - uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; - - // Mamba only needs a constant number of KV cache cells per sequence - if (llama_model_is_recurrent(model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states - } - - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); - GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - - if (!hparams.vocab_only) { - // GPU backends - for (auto * dev : model->devices) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(backend); - } - - // add ACCEL backends (such as BLAS) - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(backend); - } - } - - // add CPU backend - ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (ctx->backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: 
failed to initialize CPU backend\n", __func__); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(ctx->backend_cpu); - - // create a list of the set_n_threads functions in the backends - for (auto & backend : ctx->backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); - } - } - } - - llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); - - if (!ctx->kv_self.init(ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { - LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); - llama_free(ctx); - return nullptr; - } - - { - const size_t memory_size_k = ctx->kv_self.size_k_bytes(); - const size_t memory_size_v = ctx->kv_self.size_v_bytes(); - - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); - } - - // graph outputs buffer - { - // resized during inference when a batch uses more outputs - if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { - LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); - llama_free(ctx); - return nullptr; - } - - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(ctx->buf_output.get()), - ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0); - } - - // scheduler and compute buffers - { - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : ctx->backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model->devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; - } - } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } - - const size_t max_nodes = model->max_nodes(); - - // buffer used to store the computation graph and the tensor meta data - ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model->n_devices() > 1 && - model->params.n_gpu_layers > (int)model->hparams.n_layer && - model->params.split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; - - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : ctx->backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) 
{ - // ignore CPU backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; - } - } - } - - ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); - - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get())); - } - - // initialize scheduler with the worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - - // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(ctx->sched.get(), gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); - - // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true); - ggml_backend_sched_reserve(ctx->sched.get(), gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); - - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - llama_free(ctx); - return nullptr; - } - - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } - - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); - } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); - } - } + try { + // TODO: add logic which llama_context implementation to construct + ctx = new llama_context(*model, params, + [](llama_context & lctx, const llama_ubatch & ubatch) { + return llama_build_graph(lctx, ubatch, true); + }); + } catch (const std::exception & e) { + LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); + return nullptr; } return ctx; From 918885697e4409208b8157ffd18a6c347ca5b04d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 29 Jan 2025 14:45:04 +0200 Subject: [PATCH 19/84] llama : resolve rwkv 
conflict ggml-ci --- src/llama.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f410f7a2f1259..0ca8070cd56f1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7076,19 +7076,13 @@ struct llm_build_context { // 1 // ); + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); // ggml_build_forward_expand( // gf, // ggml_cpy( // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + // ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); // ggml_build_forward_expand(gf, ffn_inp); From 3e23be7911704f8474e7dcb32424bb043be63b06 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 2 Feb 2025 10:17:42 +0200 Subject: [PATCH 20/84] context : store graph build function callback ggml-ci --- src/llama-context.cpp | 37 +++++++++++++++++++++++++++++++++---- src/llama-context.h | 8 ++++++-- src/llama.cpp | 4 ++-- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 37e43213aaaec..1cd168db23fb7 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,8 +33,12 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function fn_build_graph_worst) : +llama_context::llama_context( + const llama_model & model, + const llama_context_params & params, + build_graph_callback && cb_build_graph) : model(model), + cb_build_graph(std::move(cb_build_graph)), t_start_us(model.t_start_us), t_load_us (model.t_load_us) { @@ -289,7 +293,7 @@ llama_context::llama_context(const llama_model & model, const llama_context_para llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp); + ggml_cgraph * gf_pp = this->cb_build_graph(*this, ubatch_pp, true); // reserve pp graph first so that buffers are only allocated once ggml_backend_sched_reserve(sched.get(), gf_pp); @@ -298,13 +302,13 @@ llama_context::llama_context(const llama_model & model, const llama_context_para // reserve with tg graph to get the number of splits and nodes llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg); + ggml_cgraph * gf_tg = this->cb_build_graph(*this, ubatch_tg, true); ggml_backend_sched_reserve(sched.get(), gf_tg); int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = fn_build_graph_worst(*this, ubatch_pp); + gf_pp = this->cb_build_graph(*this, 
ubatch_pp, true); if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); @@ -475,6 +479,31 @@ struct llama_batch_manager : public llama_batch_manager_i { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + // reserve a worst case graph if needed + if (lctx.need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = lctx.cb_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + return true; } diff --git a/src/llama-context.h b/src/llama-context.h index 1277645de4a35..5958deaef21a9 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -36,11 +36,13 @@ struct llama_batch_manager_i { // TODO: make implementation details private // TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - // TODO: store the worst-case graph build function and reuse it later + // TODO: tmp until llama-model starts implementing the graph build function + typedef std::function build_graph_callback; + llama_context( const llama_model & model, const llama_context_params & params, - std::function fn_build_graph_worst); + build_graph_callback && cb_build_graph); const struct llama_model & model; @@ -49,6 +51,8 @@ struct llama_context { llama_adapter_cvec cvec; llama_loras loras; + build_graph_callback cb_build_graph; + std::vector backends; std::vector> set_n_threads_fns; diff --git a/src/llama.cpp b/src/llama.cpp index 0ca8070cd56f1..6268249f21f7a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8508,8 +8508,8 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct ctx = new llama_context(*model, params, - [](llama_context & lctx, const llama_ubatch & ubatch) { - return llama_build_graph(lctx, ubatch, true); + [](llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { + return llama_build_graph(lctx, ubatch, worst_case); }); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); From 1eca8916b51a6952a304e68f312b63649a6cead9 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 3 Feb 2025 20:17:50 +0800 Subject: [PATCH 21/84] llama : fix rwkv inference (#11618) Signed-off-by: Molly Sophia --- src/llama-context.cpp | 222 +++++++++++++++++ src/llama-context.h | 27 +++ src/llama.cpp | 547 ++++++++++++++---------------------------- 3 files changed, 428 insertions(+), 368 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1cd168db23fb7..3bc0513ca1be0 100644 --- a/src/llama-context.cpp +++ 
b/src/llama-context.cpp @@ -1970,6 +1970,228 @@ ggml_tensor * llama_context::build_mamba_layer( } +ggml_tensor * llama_context::build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto token_shift_count = hparams.token_shift_count; + + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + struct ggml_tensor * token_shift_all = kv_self.k_l[il]; + + struct ggml_tensor * token_shift = build_copy_mask_state( + ctx0, graph, token_shift_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + + token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); + + return token_shift; +} + + +ggml_tensor * llama_context::build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto token_shift_count = hparams.token_shift_count; + const auto n_embd = hparams.n_embd; + + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + return ggml_cpy( + ctx0, + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), + ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + ); +} + + +ggml_tensor * llama_context::build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_tokens = ubatch.n_tokens; + const auto n_seqs = ubatch.n_seqs; + const auto n_embd = hparams.n_embd; + const auto head_size = hparams.wkv_head_size; + const auto n_head = n_embd / head_size; + const auto n_head_kv = hparams.n_head_kv(il); + + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + const auto layer = &model.layers[il]; + + bool is_qrwkv = layer->time_mix_first == nullptr; + + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + xxx = ggml_reshape_4d( + ctx0, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + ), + layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + ); + + xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + xxx = ggml_mul_mat( + ctx0, + ggml_reshape_4d( + ctx0, + layer->time_mix_w2, + layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + ), + xxx + ); + + struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + if (layer->time_mix_lerp_fused) { + // fusing these weights makes some performance improvement + sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + } else { + // for backward compatibility + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + } + + struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); + struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); + struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); + if (layer->time_mix_receptance_b) { + r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + } + if (layer->time_mix_key_b) { + k = ggml_add(ctx0, k, layer->time_mix_key_b); + } + if (layer->time_mix_value_b) { + v = ggml_add(ctx0, v, layer->time_mix_value_b); + } + + struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); + if (is_qrwkv) { + g = ggml_sigmoid(ctx0, g); + } else { + g = ggml_silu(ctx0, g); + } + + if (n_head_kv != 0 && n_head_kv != n_head) { + GGML_ASSERT(n_head % n_head_kv == 0); + k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens); + v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens); + struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); + k = ggml_repeat(ctx0, k, tmp); + v = ggml_repeat(ctx0, v, tmp); + } + + k = 
ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens); + v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens); + r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens); + + struct ggml_tensor * w = ggml_mul_mat( + ctx0, + layer->time_mix_decay_w2, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + ) + ); + + w = ggml_add(ctx0, w, layer->time_mix_decay); + w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); + + if (is_qrwkv) { + // k = k * (1 - w) + k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + } + + struct ggml_tensor * wkv_state = build_copy_mask_state( + ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + + struct ggml_tensor * wkv_output; + if (is_qrwkv) { + wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); + } else { + wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); + } + cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + ggml_build_forward_expand( + graph, + ggml_cpy( + ctx0, + wkv_state, + ggml_view_1d( + ctx0, + kv_self.v_l[il], + hparams.n_embd_v_s() * n_seqs, + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + ) + ) + ); + + if (!is_qrwkv) { + // group norm with head_count groups + cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens); + cur = ggml_norm(ctx0, cur, 64e-5f); + + // Convert back to regular vectors. + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + } else { + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + } + + cur = ggml_mul(ctx0, cur, g); + cur = build_lora_mm(ctx0, layer->time_mix_output, cur); + + return cur; +} + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 5958deaef21a9..4cf4a6312ede0 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -248,6 +248,33 @@ struct llama_context { int il, bool worst_case); + ggml_tensor * build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + + ggml_tensor * build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case); + + ggml_tensor * build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] diff --git a/src/llama.cpp b/src/llama.cpp index 64a5efd2da06d..171ea20178d0f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -574,175 +574,34 @@ struct llm_build_context { return cur; } - //struct ggml_tensor * build_rwkv6_time_mix( - // const struct llama_layer * layer, - // struct ggml_tensor * cur, - // struct ggml_tensor * x_prev, - // struct ggml_tensor ** wkv_state, - // size_t wkv_head_size, - // size_t head_count_kv) { - // size_t n_embd = cur->ne[0]; - // size_t n_seq_tokens = cur->ne[1]; - // size_t n_seqs = cur->ne[2]; - - // size_t head_size = 
wkv_head_size; - // size_t head_count = n_embd / head_size; - - // size_t n_tokens = n_seqs * n_seq_tokens; - - // bool is_qrwkv = layer->time_mix_first == nullptr; - - // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - - // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - - // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); - - // xxx = ggml_reshape_4d( - // ctx0, - // ggml_tanh( - // ctx0, - // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) - // ), - // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - // ); - - // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); - - // xxx = ggml_mul_mat( - // ctx0, - // ggml_reshape_4d( - // ctx0, - // layer->time_mix_w2, - // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - // ), - // xxx - // ); - - // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - // if (layer->time_mix_lerp_fused) { - // // fusing these weights makes some performance improvement - // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); - // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); - // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - // } else { - // // for backward compatibility - // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); - // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); - // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); - // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); - // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); - // } - - // struct ggml_tensor * r = build_lora_mm(layer->time_mix_receptance, xr); - // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); - // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); - // if (layer->time_mix_receptance_b) { - // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); - // } - // if (layer->time_mix_key_b) { - // k = ggml_add(ctx0, k, layer->time_mix_key_b); - // } - // if (layer->time_mix_value_b) { - // v = ggml_add(ctx0, v, layer->time_mix_value_b); - // } - - // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); - // if (is_qrwkv) { - // g = ggml_sigmoid(ctx0, g); - // } else { - // g = ggml_silu(ctx0, g); - // } - - // if (head_count_kv != head_count) { - // GGML_ASSERT(head_count % head_count_kv == 0); - // k = 
ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); - // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); - // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - // k = ggml_repeat(ctx0, k, tmp); - // v = ggml_repeat(ctx0, v, tmp); - // } - - // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); - // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); - // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); - - // struct ggml_tensor * w = ggml_mul_mat( - // ctx0, - // layer->time_mix_decay_w2, - // ggml_tanh( - // ctx0, - // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) - // ) - // ); - - // w = ggml_add(ctx0, w, layer->time_mix_decay); - // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); - // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); - - // if (is_qrwkv) { - // // k = k * (1 - w) - // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); - // } - - // struct ggml_tensor * wkv_output; - // if (!layer->time_mix_first) { - // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - // } else { - // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, *wkv_state); - // } - // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); - // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - // if (!is_qrwkv) { - // // group norm with head_count groups - // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); - // cur = ggml_norm(ctx0, cur, 64e-5f); - - // // Convert back to regular vectors. - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); - // } else { - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // } - - // cur = ggml_mul(ctx0, cur, g); - // cur = build_lora_mm(layer->time_mix_output, cur); - - // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); - //} + struct ggml_tensor * build_rwkv_channel_mix( + const struct llama_layer * layer, + struct ggml_tensor * cur, + struct ggml_tensor * x_prev, + const llm_arch arch) { + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + switch (arch) { + case LLM_ARCH_RWKV6: + { + struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + struct ggml_tensor * k = ggml_sqr( + ctx0, + ggml_relu( + ctx0, + build_lora_mm(layer->channel_mix_key, xk) + ) + ); + cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + } break; + default: + GGML_ABORT("fatal error"); + } - //struct ggml_tensor * build_rwkv6_channel_mix( - // const struct llama_layer * layer, - // struct ggml_tensor * cur, - // struct ggml_tensor * x_prev) { - // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); - // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); - - // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); - // struct ggml_tensor * k = ggml_sqr( - // ctx0, - // ggml_relu( - // ctx0, - // 
build_lora_mm(layer->channel_mix_key, xk) - // ) - // ); - - // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); - //} + return cur; + } struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -6935,226 +6794,178 @@ struct llm_build_context { return gf; } - //ggml_cgraph * build_rwkv6() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * build_rwkv6() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // // Token shift state dimensions should be 2 * n_emb - // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + GGML_ASSERT(hparams.token_shift_count == 2); - // const int64_t n_seqs = ubatch.n_seqs; - // const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // const int64_t n_tokens = ubatch.n_tokens; - // GGML_ASSERT(n_seqs != 0); - // GGML_ASSERT(ubatch.equal_seqs); - // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - // struct ggml_tensor * state_copy = build_inp_s_copy(); - // struct ggml_tensor * state_mask = build_inp_s_mask(); + inpL = build_inp_embd(model.tok_embd); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - // inpL = build_inp_embd(model.tok_embd); - // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); - // for (int il = 0; il < n_layer; ++il) { - // const llama_layer * layer = &model.layers[il]; - - // // (ab)using the KV cache to store the states - // struct ggml_tensor * token_shift = build_copy_mask_state( - // gf, kv_self.k_l[il], state_copy, state_mask, - // hparams.n_embd_k_s(), n_seqs); - - // struct ggml_tensor * wkv_states = build_copy_mask_state( - // gf, kv_self.v_l[il], state_copy, state_mask, - // hparams.n_embd_v_s(), n_seqs); - - // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); - - // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - - // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); - // struct ggml_tensor * x_prev = ggml_concat( - // ctx0, - // att_shift, - // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - // 1 - // ); - - // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - // ggml_build_forward_expand(gf, cur); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); - - // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); - // x_prev = ggml_concat( - // ctx0, - // ffn_shift, - // ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, 
x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - // 1 - // ); - // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); - // ggml_build_forward_expand(gf, cur); - - // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); - - // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); - - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - // ) - // ); - - // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - // cur = ggml_scale(ctx0, cur, 0.5F); - // } + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; - // cur = lctx.cvec.apply_to(ctx0, cur, il); - // cb(cur, "l_out", il); + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; - // // input for next layer - // inpL = cur; - // } + struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); - // cur = inpL; - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); - // cb(cur, "result_norm", -1); + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + cb(att_norm, "attn_norm", il); - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + att_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); - // ggml_build_forward_expand(gf, cur); + cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - // return gf; - //} + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + cb(ffn_norm, "ffn_norm", il); + + x_prev = ggml_concat( + ctx0, + ffn_shift, + ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), + 1 + ); + + cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); + cur = ggml_add(ctx0, cur, ffn_inp); + + token_shift = ggml_concat(ctx0, + ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), + ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), + 1 + ); + 
ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + + if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + cur = ggml_scale(ctx0, cur, 0.5F); + } + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - //ggml_cgraph * build_rwkv6qwen2() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * build_rwkv6qwen2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - // const int64_t n_seqs = ubatch.n_seqs; - // const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // const int64_t n_tokens = ubatch.n_tokens; - // GGML_ASSERT(n_seqs != 0); - // GGML_ASSERT(ubatch.equal_seqs); - // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - // struct ggml_tensor * state_copy = build_inp_s_copy(); - // struct ggml_tensor * state_mask = build_inp_s_mask(); + inpL = build_inp_embd(model.tok_embd); - // inpL = build_inp_embd(model.tok_embd); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); - // for (int il = 0; il < n_layer; ++il) { - // const llama_layer * layer = &model.layers[il]; - - // // (ab)using the KV cache to store the states - // struct ggml_tensor * token_shift = build_copy_mask_state( - // gf, kv_self.k_l[il], state_copy, state_mask, - // hparams.n_embd_k_s(), n_seqs); - - // struct ggml_tensor * wkv_states = build_copy_mask_state( - // gf, kv_self.v_l[il], state_copy, state_mask, - // hparams.n_embd_v_s(), n_seqs); - - // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); - - // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); - // struct ggml_tensor * x_prev = ggml_concat( - // ctx0, - // token_shift, - // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - // 1 - // ); - - // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), - // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - // ggml_build_forward_expand(gf, ffn_inp); - 
// ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; - // cb(ffn_inp, "ffn_inp", il); + inpL = build_inp_embd(model.tok_embd); - // // feed-forward network - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); - - // cur = build_ffn(cur, - // model.layers[il].ffn_up, NULL, NULL, - // model.layers[il].ffn_gate, NULL, NULL, - // model.layers[il].ffn_down, NULL, NULL, - // NULL, - // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - // cb(cur, "ffn_out", il); + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; - // cur = ggml_add(ctx0, cur, ffn_inp); - // cur = lctx.cvec.apply_to(ctx0, cur, il); - // cb(cur, "l_out", il); + struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); - // // input for next layer - // inpL = cur; - // } + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + cb(att_norm, "attn_norm", il); - // cur = inpL; - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + token_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); - // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); - // cb(cur, "result_norm", -1); + cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); + token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); + ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - // ggml_build_forward_expand(gf, cur); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // return gf; - //} + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } // ref: https://github.com/facebookresearch/chameleon // based on the original build_llama() function, changes: @@ -7726,14 +7537,14 @@ static 
struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - //case LLM_ARCH_RWKV6: - // { - // result = llm.build_rwkv6(); - // } break; - //case LLM_ARCH_RWKV6QWEN2: - // { - // result = llm.build_rwkv6qwen2(); - // } break; + case LLM_ARCH_RWKV6: + { + result = llm.build_rwkv6(); + } break; + case LLM_ARCH_RWKV6QWEN2: + { + result = llm.build_rwkv6qwen2(); + } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); From e0d913fccbffe7913b2fa6a00590ca68800c9b59 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 10:02:50 +0200 Subject: [PATCH 22/84] llama : clear whitespaces --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 171ea20178d0f..f03386af42b9d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6850,7 +6850,7 @@ struct llm_build_context { cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); cur = ggml_add(ctx0, cur, ffn_inp); - token_shift = ggml_concat(ctx0, + token_shift = ggml_concat(ctx0, ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 From b15fede7a9a044d0a15da03b9ceb08f7007bfc95 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 14:34:45 +0200 Subject: [PATCH 23/84] kv-cache : fix defrag condition ggml-ci --- src/llama-context.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3bc0513ca1be0..719622eaa74ec 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -532,11 +532,13 @@ struct llama_batch_manager : public llama_batch_manager_i { // decide if we need to defrag the kv cache if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? 
1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); kv_self.defrag(); } From f9971ef2e1754f8dde65d5fc0602b7719a0c5326 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 14:59:51 +0200 Subject: [PATCH 24/84] llama : dedup reserve code --- src/llama.cpp | 50 ++------------------------------------------------ 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 465938cf02ba1..e89e70bbec560 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7629,30 +7629,6 @@ static int llama_decode_impl( return -3; } - // reserve a worst case graph if needed - // TODO: extract to a function - if (lctx.need_reserve) { - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - lctx.need_reserve = false; - } - ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -7889,30 +7865,8 @@ static int llama_encode_impl( //batch_manager->prepare(ubatch); - // reserve a worst case graph if needed - // TODO: extract to a function - if (lctx.need_reserve) { - // TODO: extract to a function - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - lctx.need_reserve = false; - } + // TODO: do reserve + GGML_ASSERT(lctx.need_reserve == false); ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); From 879ba82777b93f30c32eca731d0bf03e7fd20be7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 15:00:02 +0200 Subject: [PATCH 25/84] server : increase context size for the tests ggml-ci --- examples/server/tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/server/tests/utils.py b/examples/server/tests/utils.py index ce06806620c0b..97d650a9c0cd0 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -280,7 +280,7 @@ def tinyllama2() -> ServerProcess: server.model_hf_repo = "ggml-org/models" server.model_hf_file = "tinyllamas/stories260K.gguf" server.model_alias = "tinyllama-2" - server.n_ctx = 256 + server.n_ctx = 512 server.n_batch = 32 server.n_slots = 2 server.n_predict = 64 From ef358ee78f08e4d7af3916e0d101925c5bc6e122 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 16:11:17 +0200 Subject: [PATCH 26/84] context : add decode/encode ggml-ci --- src/llama-context.cpp | 630 ++++++++++++++++++++++++++++++++++-------- src/llama-context.h | 32 +-- src/llama.cpp | 386 +------------------------- 3 files changed, 526 insertions(+), 522 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7705d583bb004..5d21dd5ef2cb3 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,6 +9,121 @@ #include #include +// llama output (TMP) + +// Make sure enough space is available for outputs. +// Returns max number of outputs for which space was reserved. +static size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + const auto & vocab = lctx.model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; + const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (lctx.output_ids.empty()) { + // init, never resized afterwards + lctx.output_ids.resize(n_batch); + } + + const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!lctx.buf_output || prev_size < new_size) { + if (lctx.buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + lctx.buf_output = nullptr; + lctx.logits = nullptr; + lctx.embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = lctx.model.dev_output(); + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (lctx.buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); + + lctx.logits = has_logits ? 
output_base : nullptr; + lctx.embd = has_embd ? output_base + logits_size : nullptr; + + lctx.output_size = n_outputs_max; + lctx.logits_size = logits_size; + lctx.embd_size = embd_size; + + // set all ids as invalid (negative) + std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); + + ggml_backend_buffer_clear(lctx.buf_output.get(), 0); + + lctx.n_outputs = 0; + + return n_outputs_max; +} + +// make the outputs have the same order they had in the user-provided batch +static void llama_output_reorder(struct llama_context & ctx) { + std::vector & out_ids = ctx.sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = ctx.model.vocab.n_tokens(); + const uint32_t n_embd = ctx.model.hparams.n_embd; + + const int32_t n_outputs = ctx.n_outputs; + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (ctx.logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); + } + } + if (ctx.embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); + } + } + } + std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + ctx.output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -340,6 +455,20 @@ llama_context::llama_context( } +struct llama_batch_manager_i { + virtual ~llama_batch_manager_i() = default; + + virtual bool is_done() const = 0; + virtual llama_ubatch next() = 0; + virtual bool prepare() = 0; + virtual void restore() = 0; + virtual void update() = 0; + virtual void finalize() = 0; + + // TODO: might be temporary + int64_t n_outputs_all = 0; +}; + struct llama_batch_manager : public llama_batch_manager_i { llama_batch_manager(llama_context & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & model = lctx.model; @@ -398,6 +527,10 @@ struct llama_batch_manager : public llama_batch_manager_i { ~llama_batch_manager() override { } + virtual bool is_done() const override { + return lctx.sbatch.n_tokens == 0; + } + virtual llama_ubatch next() override { ubatch = llama_ubatch(); @@ -558,6 +691,390 @@ std::unique_ptr llama_context::prepare_batch(const llama_ return std::make_unique(*this, batch); } +int llama_context::decode(llama_batch & inp_batch) { + is_encoding = false; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd; + + // TODO: try catch + auto bman = prepare_batch(batch); + + const auto n_outputs_all = bman->n_outputs_all; + + // reserve output buffer + // TODO: move to batch manager? + if (llama_output_reserve(*this, bman->n_outputs_all) < (size_t) n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; + + while (!bman->is_done()) { + llama_ubatch ubatch = bman->next(); + + if (!bman->prepare()) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + bman->restore(); + return -3; + } + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output is always the last tensor in the graph + struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); + struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); + + if (n_outputs == 0) { + // no output + t_logits = nullptr; + t_embd = nullptr; + } else if (cparams.embeddings) { + t_logits = nullptr; // do not extract logits for embedding case + t_embd = nullptr; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + t_embd = ggml_graph_node(gf, i); + break; + } + } + GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); + } else { + t_embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); + } + + const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + bman->restore(); + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + bman->update(); + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + if (t_logits) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits + n_outputs_prev*n_vocab; + const int32_t n_outputs_new = n_outputs; + + if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd + n_outputs_prev*n_embd; + const int32_t n_outputs_new = n_outputs; + 
+ if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings (cleared before processing each batch) + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + n_outputs_prev += n_outputs; + } + + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { + size_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; + + // wait for the computation to finish (automatically done when obtaining the model output) + //llama_synchronize(&; + + bman->finalize(); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + +int llama_context::encode(llama_batch & inp_batch) { + is_encoding = true; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (llama_output_reserve(*this, n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (uint32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + inp_embd_enc = NULL; + n_outputs = n_tokens; + + //batch_manager->prepare(ubatch); + + // TODO: do reserve + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output embeddings after the final encoder normalization + struct ggml_tensor * t_embd = nullptr; + + // there are two cases here + if (llama_model_has_decoder(&model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + t_embd = ggml_graph_node(gf, -1); + GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + t_embd = ggml_graph_node(gf, -1); + if (strcmp(t_embd->name, "result_embd_pooled") != 0) { + t_embd = ggml_graph_node(gf, -2); + } + GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } + + const auto compute_status = compute_graph(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + if (llama_model_has_decoder(&model)) { + embd_enc.resize(n_tokens*n_embd); + float * embd_out = embd_enc.data(); + + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + // remember the sequence ids used during the encoding - needed for cross attention later + seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(embd != nullptr); + + switch 
(cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + enum ggml_status llama_context::compute_graph( ggml_cgraph * graph, bool batched) { @@ -2194,119 +2711,6 @@ ggml_tensor * llama_context::build_rwkv6_time_mix( return cur; } -// llama output - -size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & vocab = lctx.model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; - const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (lctx.output_ids.empty()) { - // init, never resized afterwards - lctx.output_ids.resize(n_batch); - } - - const size_t prev_size = lctx.buf_output ? 
ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!lctx.buf_output || prev_size < new_size) { - if (lctx.buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - lctx.buf_output = nullptr; - lctx.logits = nullptr; - lctx.embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = lctx.model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (lctx.buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); - - lctx.logits = has_logits ? output_base : nullptr; - lctx.embd = has_embd ? output_base + logits_size : nullptr; - - lctx.output_size = n_outputs_max; - lctx.logits_size = logits_size; - lctx.embd_size = embd_size; - - // set all ids as invalid (negative) - std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - - ggml_backend_buffer_clear(lctx.buf_output.get(), 0); - - lctx.n_outputs = 0; - - return n_outputs_max; -} - -void llama_output_reorder(struct llama_context & ctx) { - std::vector & out_ids = ctx.sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_embd = ctx.model.hparams.n_embd; - - const int32_t n_outputs = ctx.n_outputs; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? 
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (ctx.logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); - } - } - if (ctx.embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); - } - } - } - std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - ctx.output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - // // interface implementation // diff --git a/src/llama-context.h b/src/llama-context.h index 4cf4a6312ede0..f6d63eb3cebfc 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,22 +16,7 @@ using llama_loras = std::unordered_map; -// TODO: this is very WIP - improve -struct llama_batch_manager_i { - virtual ~llama_batch_manager_i() = default; - - //bool is_done() const; - - virtual llama_ubatch next() = 0; - - virtual bool prepare() = 0; - virtual void restore() = 0; - virtual void update() = 0; - virtual void finalize() = 0; - - // TODO: might be temporary - int64_t n_outputs_all = 0; -}; +struct llama_batch_manager_i; // TODO: make implementation details private // TODO: become abstract base class, split the current implementation into different child classes @@ -44,6 +29,8 @@ struct llama_context { const llama_context_params & params, build_graph_callback && cb_build_graph); + virtual ~llama_context() = default; + const struct llama_model & model; llama_cparams cparams; @@ -104,8 +91,10 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; - // TODO: do not pass logits_all explicitly - std::unique_ptr prepare_batch(const llama_batch & batch); + virtual std::unique_ptr prepare_batch(const llama_batch & batch); + + virtual int decode(llama_batch & inp_batch); + virtual int encode(llama_batch & inp_batch); // returns the result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( @@ -286,13 +275,6 @@ struct llama_context { int n_pos_per_token = 1; }; -// Make sure enough space is available for outputs. -// Returns max number of outputs for which space was reserved. -size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs); - -// make the outputs have the same order they had in the user-provided batch -void llama_output_reorder(struct llama_context & ctx); - // For internal test use // TODO: remove const std::vector> & llama_internal_get_tensor_map(struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index e89e70bbec560..ed5e1e5254e7a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7584,213 +7584,7 @@ static struct ggml_cgraph * llama_build_graph( static int llama_decode_impl( llama_context & lctx, llama_batch inp_batch) { - - lctx.is_encoding = false; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - - const auto & model = lctx.model; - const auto & vocab = model.vocab; - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - - const int32_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; - - // TODO: try catch - auto bman = lctx.prepare_batch(batch); - - const auto n_outputs_all = bman->n_outputs_all; - - // reserve output buffer - // TODO: move to batch manager? - if (llama_output_reserve(lctx, bman->n_outputs_all) < (size_t) n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; - - int64_t n_outputs_prev = 0; - - while (lctx.sbatch.n_tokens > 0) { - llama_ubatch ubatch = bman->next(); - - if (!bman->prepare()) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - bman->restore(); - return -3; - } - - ggml_backend_sched_reset(lctx.sched.get()); - ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs(ubatch); - - // the output is always the last tensor in the graph - struct ggml_tensor * res = ggml_graph_node(gf, -1); - struct ggml_tensor * embd = ggml_graph_node(gf, -2); - - if (lctx.n_outputs == 0) { - // no output - res = nullptr; - embd = nullptr; - } else if (cparams.embeddings) { - res = nullptr; // do not extract logits for embedding case - embd = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - embd = ggml_graph_node(gf, i); - break; - } - } - GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); - } else { - embd = nullptr; // do not extract embeddings when not needed - GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); - } - - const auto compute_status = lctx.compute_graph(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - bman->restore(); - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - } - - bman->update(); - - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} - - // extract logits - if (res) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(lctx.logits != nullptr); - - float * logits_out = lctx.logits + n_outputs_prev*n_vocab; - const int32_t n_outputs_new = lctx.n_outputs; - - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size); - ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); - } - } - - // extract embeddings - if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); - GGML_ASSERT(backend_embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - 
GGML_ASSERT(lctx.embd != nullptr); - float * embd_out = lctx.embd + n_outputs_prev*n_embd; - const int32_t n_outputs_new = lctx.n_outputs; - - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings (cleared before processing each batch) - auto & embd_seq_out = lctx.embd_seq; - - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // extract the rerank score - a single float per sequence - auto & embd_seq_out = lctx.embd_seq; - - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(1); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - - n_outputs_prev += lctx.n_outputs; - } - - // set output mappings - { - bool sorted_output = true; - - GGML_ASSERT(lctx.sbatch.out_ids.size() == (size_t) n_outputs_all); - - for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { - size_t out_id = lctx.sbatch.out_ids[i]; - lctx.output_ids[out_id] = i; - if (out_id != i) { - sorted_output = false; - } - } - - if (sorted_output) { - lctx.sbatch.out_ids.clear(); - } - } - - // set to total number of outputs in the batch, for use in llama_get_logits_ith - lctx.n_outputs = n_outputs_all; - - // wait for the computation to finish (automatically done when obtaining the model output) - //llama_synchronize(&lctx); - - bman->finalize(); - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(lctx.sched.get()); - - return 0; + return lctx.decode(inp_batch); } // encode a batch of tokens by evaluating the encoder part of the transformer @@ -7805,183 +7599,7 @@ static int llama_decode_impl( static int llama_encode_impl( llama_context & lctx, llama_batch inp_batch) { - - lctx.is_encoding = true; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens = batch.n_tokens; - - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - - if (batch.token) { - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - } - } - - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); - - if (lctx.t_compute_start_us == 0) { - lctx.t_compute_start_us = ggml_time_us(); - } - - lctx.n_queued_tokens += n_tokens; - - const int64_t n_embd = hparams.n_embd; - - lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); - - // reserve output buffer - if (llama_output_reserve(lctx, n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); - return -2; - }; - - for (uint32_t i = 0; i < n_tokens; ++i) { - lctx.output_ids[i] = i; - } - - lctx.inp_embd_enc = NULL; - lctx.n_outputs = n_tokens; - - //batch_manager->prepare(ubatch); - - // TODO: do reserve - GGML_ASSERT(lctx.need_reserve == false); - - ggml_backend_sched_reset(lctx.sched.get()); - ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs(ubatch); - - // the output embeddings after the final encoder normalization - struct ggml_tensor * embd = nullptr; - - // there are two cases here - if (llama_model_has_decoder(&lctx.model)) { - // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = ggml_graph_node(gf, -1); - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); - } else { - // second case is an encoder-only T5 model - if (cparams.embeddings) { - // only output embeddings if required - embd = ggml_graph_node(gf, -1); - if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = ggml_graph_node(gf, -2); - } - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); - } - } - - const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - - // extract embeddings - if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); - GGML_ASSERT(backend_embd != nullptr); - - if (llama_model_has_decoder(&lctx.model)) { - lctx.embd_enc.resize(n_tokens*n_embd); - float * embd_out = lctx.embd_enc.data(); - - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - // remember the sequence ids used during the encoding - needed for cross attention later - lctx.seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < 
ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - lctx.seq_ids_enc[i].insert(seq_id); - } - } - } else { - GGML_ASSERT(lctx.embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(lctx.embd != nullptr); - float * embd_out = lctx.embd; - - GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings - auto & embd_seq_out = lctx.embd_seq; - embd_seq_out.clear(); - - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - for (uint32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - } - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(lctx.sched.get()); - - return 0; + return lctx.encode(inp_batch); } // From d1d8d530083a9bf3ada2427bf59e97fa58667365 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 16:50:14 +0200 Subject: [PATCH 27/84] bman : remove ubatch member ggml-ci --- src/llama-context.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5d21dd5ef2cb3..4387128fedf15 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -460,9 +460,9 @@ struct llama_batch_manager_i { virtual bool is_done() const = 0; virtual llama_ubatch next() = 0; - virtual bool prepare() = 0; + virtual bool prepare(const llama_ubatch & ubatch) = 0; virtual void restore() = 0; - virtual void update() = 0; + virtual void update(const llama_ubatch & ubatch) = 0; virtual void finalize() = 0; // TODO: might be temporary @@ -532,7 +532,7 @@ struct llama_batch_manager : public llama_batch_manager_i { } virtual llama_ubatch next() override { - ubatch = llama_ubatch(); + llama_ubatch ubatch = llama_ubatch(); const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; @@ -557,7 +557,7 @@ struct llama_batch_manager : public llama_batch_manager_i { return ubatch; } - virtual bool prepare() override { + virtual bool prepare(const llama_ubatch & ubatch) override { const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; const auto & batch = lctx.sbatch.batch; @@ -644,7 +644,7 @@ struct llama_batch_manager : public llama_batch_manager_i { kv_slot_restorer.restore(lctx.kv_self); } - virtual void update() override { + virtual void update(const llama_ubatch & ubatch) override { auto & kv_self = lctx.kv_self; // update the kv ring buffer @@ -682,8 +682,6 @@ struct llama_batch_manager : public llama_batch_manager_i { const llama_batch 
& batch; - llama_ubatch ubatch; - llama_kv_slot_restorer kv_slot_restorer; }; @@ -728,7 +726,7 @@ int llama_context::decode(llama_batch & inp_batch) { while (!bman->is_done()) { llama_ubatch ubatch = bman->next(); - if (!bman->prepare()) { + if (!bman->prepare(ubatch)) { LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); bman->restore(); return -3; @@ -782,7 +780,7 @@ int llama_context::decode(llama_batch & inp_batch) { } } - bman->update(); + bman->update(ubatch); // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { From 2cd8a903c84b9fbf91f256a6349e05e492a47421 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 17:01:27 +0200 Subject: [PATCH 28/84] context : make output functions members ggml-ci --- src/llama-context.cpp | 238 ++++++++++++++++++++---------------------- src/llama-context.h | 8 ++ 2 files changed, 122 insertions(+), 124 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4387128fedf15..87d6642da778f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,121 +9,6 @@ #include #include -// llama output (TMP) - -// Make sure enough space is available for outputs. -// Returns max number of outputs for which space was reserved. -static size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & vocab = lctx.model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; - const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (lctx.output_ids.empty()) { - // init, never resized afterwards - lctx.output_ids.resize(n_batch); - } - - const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!lctx.buf_output || prev_size < new_size) { - if (lctx.buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - lctx.buf_output = nullptr; - lctx.logits = nullptr; - lctx.embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = lctx.model.dev_output(); - auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (lctx.buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); - - lctx.logits = has_logits ? output_base : nullptr; - lctx.embd = has_embd ? output_base + logits_size : nullptr; - - lctx.output_size = n_outputs_max; - lctx.logits_size = logits_size; - lctx.embd_size = embd_size; - - // set all ids as invalid (negative) - std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - - ggml_backend_buffer_clear(lctx.buf_output.get(), 0); - - lctx.n_outputs = 0; - - return n_outputs_max; -} - -// make the outputs have the same order they had in the user-provided batch -static void llama_output_reorder(struct llama_context & ctx) { - std::vector & out_ids = ctx.sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_embd = ctx.model.hparams.n_embd; - - const int32_t n_outputs = ctx.n_outputs; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? - // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (ctx.logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); - } - } - if (ctx.embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); - } - } - } - std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - ctx.output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -334,7 +219,7 @@ llama_context::llama_context( // graph outputs buffer { // resized during inference when a batch uses more outputs - if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) { + if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); throw std::runtime_error("failed to reserve initial output buffer"); } @@ -716,7 +601,7 @@ int llama_context::decode(llama_batch & inp_batch) { // reserve output buffer // TODO: move to batch manager? 
- if (llama_output_reserve(*this, bman->n_outputs_all) < (size_t) n_outputs_all) { + if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; }; @@ -940,7 +825,7 @@ int llama_context::encode(llama_batch & inp_batch) { const llama_ubatch ubatch = sbatch.split_simple(n_tokens); // reserve output buffer - if (llama_output_reserve(*this, n_tokens) < n_tokens) { + if (reserve_outputs(n_tokens) < n_tokens) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); return -2; }; @@ -1555,6 +1440,113 @@ void llama_context::set_inputs(const llama_ubatch & ubatch) { } } +void llama_context::reorder_outputs() { + std::vector & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} + +size_t llama_context::reserve_outputs(size_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + + logits = has_logits ? output_base : nullptr; + embd = has_embd ? output_base + logits_size : nullptr; + + output_size = n_outputs_max; + + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); + + ggml_backend_buffer_clear(buf_output.get(), 0); + + n_outputs = 0; + + return n_outputs_max; +} + // do mat_mul, while optionally apply lora ggml_tensor * llama_context::build_lora_mm( ggml_context * ctx0, @@ -2827,8 +2819,7 @@ float * llama_get_logits(struct llama_context * ctx) { llama_synchronize(ctx); // reorder logits for backward compatibility - // TODO: maybe deprecate this - llama_output_reorder(*ctx); + ctx->reorder_outputs(); return ctx->logits; } @@ -2877,8 +2868,7 @@ float * llama_get_embeddings(struct llama_context * ctx) { llama_synchronize(ctx); // reorder embeddings for backward compatibility - // TODO: maybe deprecate this - llama_output_reorder(*ctx); + ctx->reorder_outputs(); return ctx->embd; } @@ -3187,7 +3177,7 @@ struct llama_data_write { //} void write_output_ids(struct llama_context * ctx) { - llama_output_reorder(*ctx); + ctx->reorder_outputs(); const uint32_t n_outputs = ctx->n_outputs; @@ -3281,7 +3271,7 @@ struct llama_data_read { uint32_t n_outputs; read_to(&n_outputs, sizeof(n_outputs)); - if (n_outputs > llama_output_reserve(*ctx, n_outputs)) { + if (n_outputs > ctx->reserve_outputs(n_outputs)) { throw std::runtime_error("could not reserve outputs"); } diff --git a/src/llama-context.h b/src/llama-context.h index f6d63eb3cebfc..8f22fd3b1d3a1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -114,6 +114,14 @@ struct llama_context { void set_inputs(const llama_ubatch & ubatch); + // make the outputs have the same order they had in the user-provided batch + // TODO: maybe deprecate this + void reorder_outputs(); + + // Make sure enough space is available for outputs. + // Returns max number of outputs for which space was reserved. 
+ size_t reserve_outputs(size_t n_outputs); + ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, From 02ef4be975bd7549971caa3149061008790112bb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 11 Feb 2025 11:25:18 +0200 Subject: [PATCH 29/84] context : initial abstraction ggml-ci --- src/llama-context.cpp | 2077 +++++++++++++++++++++++------------------ src/llama-context.h | 480 ++++++++-- src/llama.cpp | 240 +---- 3 files changed, 1570 insertions(+), 1227 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 87d6642da778f..13beb097cbadd 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,14 +33,68 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -llama_context::llama_context( +// llama_context + +llama_context::llama_context(const llama_model & model) : + model (model), + t_start_us(model.t_start_us), + t_load_us (model.t_load_us) { +} + +llama_context::~llama_context() = default; + +void llama_context::synchronize() { + ggml_backend_sched_synchronize(sched.get()); + + // FIXME: if multiple single tokens are evaluated without a synchronization, + // the stats will be added to the prompt evaluation stats + // this should only happen when using batch size 1 to evaluate a batch + + // add the evaluation to the stats + if (n_queued_tokens == 1) { + if (!cparams.no_perf) { + t_eval_us += ggml_time_us() - t_compute_start_us; + } + n_eval++; + } else if (n_queued_tokens > 1) { + if (!cparams.no_perf) { + t_p_eval_us += ggml_time_us() - t_compute_start_us; + } + n_p_eval += n_queued_tokens; + } + + // get a more accurate load time, upon first eval + if (n_queued_tokens > 0 && !has_evaluated_once) { + t_load_us = ggml_time_us() - t_start_us; + has_evaluated_once = true; + } + + n_queued_tokens = 0; + t_compute_start_us = 0; +} + +int64_t llama_context::n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 
4 : 1; +} + +ggml_context_ptr llama_context::init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + return ggml_context_ptr { ggml_init(params) }; +} + +// llama_context_unified + +llama_context_unified::llama_context_unified( const llama_model & model, const llama_context_params & params, build_graph_callback && cb_build_graph) : - model(model), - cb_build_graph(std::move(cb_build_graph)), - t_start_us(model.t_start_us), - t_load_us (model.t_load_us) { + llama_context(model), + cb_build_graph(std::move(cb_build_graph)){ const auto & hparams = model.hparams; @@ -252,6 +306,7 @@ llama_context::llama_context( const size_t max_nodes = model.max_nodes(); // buffer used to store the computation graph and the tensor meta data + // TODO: move to base class buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); // TODO: move these checks to ggml_backend_sched @@ -337,25 +392,161 @@ llama_context::llama_context( } } } +} + +llama_context_unified::~llama_context_unified() = default; +uint32_t llama_context_unified::n_ctx() const { + return cparams.n_ctx; } -struct llama_batch_manager_i { - virtual ~llama_batch_manager_i() = default; +uint32_t llama_context_unified::n_batch() const { + return cparams.n_batch; +} - virtual bool is_done() const = 0; - virtual llama_ubatch next() = 0; - virtual bool prepare(const llama_ubatch & ubatch) = 0; - virtual void restore() = 0; - virtual void update(const llama_ubatch & ubatch) = 0; - virtual void finalize() = 0; +uint32_t llama_context_unified::n_ubatch() const { + return cparams.n_ubatch; +} - // TODO: might be temporary - int64_t n_outputs_all = 0; -}; +uint32_t llama_context_unified::n_seq_max() const { + // TODO: add notion of n_seq_max to llama_kv_cache and use it here + return kv_self.size; +} + +llama_kv_cache * llama_context_unified::get_kv_self() { + return &kv_self; +} + +const llama_kv_cache * llama_context_unified::get_kv_self() const { + return &kv_self; +} + +enum llama_pooling_type llama_context_unified::pooling_type() const { + return cparams.pooling_type; +} + +float * llama_context_unified::get_logits() { + // reorder logits for backward compatibility + reorder_outputs(); + + return logits; +} + +float * llama_context_unified::get_logits_ith(int32_t i) { + int32_t j = -1; + + try { + if (logits == nullptr) { + throw std::runtime_error("no logits"); + } + + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } + + return logits + j*model.vocab.n_tokens(); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + +float * llama_context_unified::get_embeddings() { + // reorder embeddings for backward compatibility + reorder_outputs(); + + return embd; +} + +float * llama_context_unified::get_embeddings_ith(int32_t i) { + int32_t j = -1; + + try { + if 
(embd == nullptr) { + throw std::runtime_error("no embeddings"); + } + + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } + + return embd + j*model.hparams.n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + +float * llama_context_unified::get_embeddings_seq(llama_seq_id seq_id) { + auto it = embd_seq.find(seq_id); + if (it == embd_seq.end()) { + return nullptr; + } + + return it->second.data(); +} + +ggml_context_ptr llama_context_unified::init() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; + + return llama_context::init(); +} -struct llama_batch_manager : public llama_batch_manager_i { - llama_batch_manager(llama_context & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { +struct llama_context_unified::batch_manager { + batch_manager(llama_context_unified & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & model = lctx.model; const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; @@ -409,14 +600,14 @@ struct llama_batch_manager : public llama_batch_manager_i { /* logits_all */ logits_all); } - ~llama_batch_manager() override { + ~batch_manager() { } - virtual bool is_done() const override { + bool is_done() const { return lctx.sbatch.n_tokens == 0; } - virtual llama_ubatch next() override { + llama_ubatch next() { llama_ubatch ubatch = llama_ubatch(); const auto & cparams = lctx.cparams; @@ -442,7 +633,7 @@ struct llama_batch_manager : public llama_batch_manager_i { return ubatch; } - virtual bool prepare(const llama_ubatch & ubatch) override { + bool prepare(const llama_ubatch & ubatch) { const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; const auto & batch = lctx.sbatch.batch; @@ -525,11 +716,11 @@ struct llama_batch_manager : public llama_batch_manager_i { return true; } - virtual void restore() override { + void restore() { kv_slot_restorer.restore(lctx.kv_self); } - virtual void update(const llama_ubatch & ubatch) override { + void update(const llama_ubatch & ubatch) { auto & kv_self = lctx.kv_self; // update the kv ring buffer @@ -543,7 +734,7 @@ struct llama_batch_manager : public llama_batch_manager_i { } } - virtual void finalize() override { + void finalize() { const auto & cparams = lctx.cparams; auto & kv_self = lctx.kv_self; @@ -563,18 +754,20 @@ struct llama_batch_manager : public llama_batch_manager_i { } } - llama_context & lctx; + int64_t n_outputs_all = 0; + + llama_context_unified & lctx; const 
llama_batch & batch; llama_kv_slot_restorer kv_slot_restorer; }; -std::unique_ptr llama_context::prepare_batch(const llama_batch & batch) { - return std::make_unique(*this, batch); +std::unique_ptr llama_context_unified::prepare_batch(const llama_batch & batch) { + return std::make_unique(*this, batch); } -int llama_context::decode(llama_batch & inp_batch) { +int llama_context_unified::decode(llama_batch & inp_batch) { is_encoding = false; if (inp_batch.n_tokens == 0) { @@ -679,12 +872,11 @@ int llama_context::decode(llama_batch & inp_batch) { GGML_ASSERT(logits != nullptr); float * logits_out = logits + n_outputs_prev*n_vocab; - const int32_t n_outputs_new = n_outputs; - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) logits_size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); } } @@ -699,12 +891,11 @@ int llama_context::decode(llama_batch & inp_batch) { // extract token embeddings GGML_ASSERT(embd != nullptr); float * embd_out = embd + n_outputs_prev*n_embd; - const int32_t n_outputs_new = n_outputs; - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_MEAN: @@ -770,7 +961,7 @@ int llama_context::decode(llama_batch & inp_batch) { n_outputs = n_outputs_all; // wait for the computation to finish (automatically done when obtaining the model output) - //llama_synchronize(&; + //synchronize(); bman->finalize(); @@ -781,7 +972,7 @@ int llama_context::decode(llama_batch & inp_batch) { return 0; } -int llama_context::encode(llama_batch & inp_batch) { +int llama_context_unified::encode(llama_batch & inp_batch) { is_encoding = true; if (inp_batch.n_tokens == 0) { @@ -958,7 +1149,7 @@ int llama_context::encode(llama_batch & inp_batch) { return 0; } -enum ggml_status llama_context::compute_graph( +enum ggml_status llama_context_unified::compute_graph( ggml_cgraph * graph, bool batched) { int n_threads = batched ? 
cparams.n_threads_batch : cparams.n_threads; @@ -985,43 +1176,23 @@ enum ggml_status llama_context::compute_graph( return status; } -llama_pos llama_context::pos_max() const { +llama_pos llama_context_unified::pos_max() const { return kv_self.pos_max(); } -uint32_t llama_context::get_ctx_padding(const llama_cparams & cparams) const { +uint32_t llama_context_unified::get_ctx_padding(const llama_cparams & cparams) const { return kv_self.get_padding(cparams); } -// TODO: improve -void llama_context::reset() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; - inp_KQ_mask = nullptr; - inp_KQ_mask_cnv = nullptr; - inp_KQ_mask_swa = nullptr; - inp_KQ_mask_swa_cnv = nullptr; - inp_KQ_mask_cross = nullptr; - inp_K_shift = nullptr; - inp_s_copy = nullptr; - inp_s_mask = nullptr; -} - -void llama_context::prepare_k_shift() { +void llama_context_unified::prepare_k_shift() { } -void llama_context::prepare_defrag() { +void llama_context_unified::prepare_defrag() { } // llama input -void llama_context::set_inputs(const llama_ubatch & ubatch) { +void llama_context_unified::set_inputs(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; // @@ -1056,8 +1227,8 @@ void llama_context::set_inputs(const llama_ubatch & ubatch) { if (ubatch.pos && inp_pos) { const int64_t n_tokens = ubatch.n_tokens; - auto n_pos = n_pos_per_token; - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(inp_pos)); + + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { @@ -1440,7 +1611,7 @@ void llama_context::set_inputs(const llama_ubatch & ubatch) { } } -void llama_context::reorder_outputs() { +void llama_context_unified::reorder_outputs() { std::vector & out_ids = sbatch.out_ids; if (!out_ids.empty()) { const uint32_t n_vocab = model.vocab.n_tokens(); @@ -1478,7 +1649,7 @@ void llama_context::reorder_outputs() { } } -size_t llama_context::reserve_outputs(size_t n_outputs) { +size_t llama_context_unified::reserve_outputs(size_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; @@ -1605,7 +1776,7 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } -void llama_context::kv_self_update() { +void llama_context_unified::kv_self_update() { auto & kv = kv_self; if (kv.has_shift) { @@ -1619,15 +1790,8 @@ void llama_context::kv_self_update() { ggml_backend_sched_reset(sched.get()); - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context * ctx0 = ggml_init(params); - - reset(); + auto ctx = init(); + auto ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -1639,8 +1803,6 @@ void llama_context::kv_self_update() { compute_graph(gf, false); - ggml_free(ctx0); - need_reserve = true; } @@ -1659,15 +1821,8 @@ void llama_context::kv_self_update() { ggml_backend_sched_reset(sched.get()); - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ggml_context * ctx0 = ggml_init(params); - - reset(); + auto ctx = init(); + auto ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -1680,19 
+1835,13 @@ void llama_context::kv_self_update() { compute_graph(gf, false); - ggml_free(ctx0); - kv.do_defrag = false; need_reserve = true; } } -void llama_kv_self_update(llama_context * ctx) { - ctx->kv_self_update(); -} - -void llama_context::build_attn_inp( +void llama_context_unified::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -1723,7 +1872,7 @@ void llama_context::build_attn_inp( } } -void llama_context::build_attn_kv_store( +void llama_context_unified::build_attn_kv_store( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * k_cur, @@ -1767,7 +1916,7 @@ void llama_context::build_attn_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); } -ggml_tensor * llama_context::build_attn_qkv( +ggml_tensor * llama_context_unified::build_attn_qkv( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * wo, @@ -1919,7 +2068,7 @@ ggml_tensor * llama_context::build_attn_qkv( return cur; } -ggml_tensor * llama_context::build_soft_max_ext( +ggml_tensor * llama_context_unified::build_soft_max_ext( ggml_context * ctx0, ggml_tensor * kq, float kq_scale) { @@ -1928,7 +2077,7 @@ ggml_tensor * llama_context::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -ggml_tensor * llama_context::get_rope_factors(int il) { +ggml_tensor * llama_context_unified::get_rope_factors(int il) { const auto & hparams = model.hparams; // choose long/short freq factors based on the context size @@ -1945,7 +2094,96 @@ ggml_tensor * llama_context::get_rope_factors(int il) { return model.layers[il].rope_short; } -void llama_context::build_k_shift( +ggml_tensor * llama_context_unified::build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) { + const auto & hparams = model.hparams; + + const int64_t n_embd = hparams.n_embd; + + struct ggml_tensor * inpL; + + if (ubatch.token) { + inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp_tokens, "inp_tokens", -1); + ggml_set_input(inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = inp_embd; + ggml_set_input(inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + //cb(inpL, "inp_embd", -1); + + return inpL; +} + +ggml_tensor * llama_context_unified::build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) { + inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp_pos); + + return inp_pos; +} + +ggml_tensor * llama_context_unified::build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + const int32_t n_out_ids = worst_case ? 
n_tokens : n_outputs; + + inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); + ggml_set_input(inp_out_ids); + + return inp_out_ids; +} + +ggml_tensor * llama_context_unified::build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) { + inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp_mean); + + return inp_mean; +} + +ggml_tensor * llama_context_unified::build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) { + inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp_cls); + + return inp_cls; +} + +void llama_context_unified::build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) { const auto & n_ctx = cparams.n_ctx; @@ -2017,7 +2255,7 @@ void llama_context::build_k_shift( } } -void llama_context::build_defrag( +void llama_context_unified::build_defrag( ggml_context * ctx0, ggml_cgraph * graph) { const auto & hparams = model.hparams; @@ -2287,7 +2525,39 @@ void llama_context::build_defrag( #endif } -ggml_tensor * llama_context::build_inp_s_copy( +ggml_tensor * llama_context_unified::build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + const auto & hparams = model.hparams; + const int64_t n_embd = hparams.n_embd; + + // TODO: not sure if this is correct + const int32_t n_outputs_enc = worst_case ? n_tokens : embd_enc.size() / n_embd; + + inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); + ggml_set_input(inp_embd_enc); + + return inp_embd_enc; +} + +ggml_tensor * llama_context_unified::build_inp_KQ_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + const auto & hparams = model.hparams; + const int64_t n_embd = hparams.n_embd; + + // TODO: not sure if this is correct + const int32_t n_outputs_enc = worst_case ? n_tokens : embd_enc.size() / n_embd; + + inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + ggml_set_input(inp_KQ_mask_cross); + + return inp_KQ_mask_cross; +} + +ggml_tensor * llama_context_unified::build_inp_s_copy( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? kv_self.size : kv_self.n; @@ -2298,7 +2568,7 @@ ggml_tensor * llama_context::build_inp_s_copy( return inp_s_copy; } -ggml_tensor * llama_context::build_inp_s_mask( +ggml_tensor * llama_context_unified::build_inp_s_mask( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? 
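// for the worst-case (reservation) graph use the full cache size; during
// normal decoding only the kv_self.n cells currently in use need to be masked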
kv_self.size : kv_self.n; @@ -2308,7 +2578,7 @@ ggml_tensor * llama_context::build_inp_s_mask( return inp_s_mask; } -ggml_tensor * llama_context::build_copy_mask_state( +ggml_tensor * llama_context_unified::build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * s, @@ -2343,7 +2613,7 @@ ggml_tensor * llama_context::build_copy_mask_state( } // TODO: split -ggml_tensor * llama_context::build_mamba_layer( +ggml_tensor * llama_context_unified::build_mamba_layer( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -2479,7 +2749,7 @@ ggml_tensor * llama_context::build_mamba_layer( } -ggml_tensor * llama_context::build_rwkv_token_shift_load( +ggml_tensor * llama_context_unified::build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * state_copy, @@ -2506,7 +2776,7 @@ ggml_tensor * llama_context::build_rwkv_token_shift_load( } -ggml_tensor * llama_context::build_rwkv_token_shift_store( +ggml_tensor * llama_context_unified::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, @@ -2530,7 +2800,7 @@ ggml_tensor * llama_context::build_rwkv_token_shift_store( } -ggml_tensor * llama_context::build_rwkv6_time_mix( +ggml_tensor * llama_context_unified::build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -2702,1048 +2972,999 @@ ggml_tensor * llama_context::build_rwkv6_time_mix( } // -// interface implementation +// state // -void llama_free(struct llama_context * ctx) { - delete ctx; -} +// TODO: this needs a big rework -uint32_t llama_n_ctx(const struct llama_context * ctx) { - return ctx->cparams.n_ctx; -} +// TODO: replace all non-fatal assertions with returned errors or exceptions +struct llama_data_write { + llama_data_write(llama_context_unified * ctx) : ctx(ctx) {} + virtual ~llama_data_write() = default; -uint32_t llama_n_batch(const struct llama_context * ctx) { - return ctx->cparams.n_batch; -} + virtual void write(const void * src, size_t size) = 0; + virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; + virtual size_t get_size_written() = 0; -uint32_t llama_n_ubatch(const struct llama_context * ctx) { - return ctx->cparams.n_ubatch; -} + void write_string(const std::string & str) { + uint32_t str_size = str.size(); -uint32_t llama_n_seq_max(const struct llama_context * ctx) { - // TODO: add notion of n_seq_max to llama_kv_cache and use it here - return ctx->kv_self.size; -} + write(&str_size, sizeof(str_size)); + write(str.data(), str_size); + } -const llama_model * llama_get_model(const llama_context * ctx) { - return &ctx->model; -} + void write_model_info() { + const std::string arch_str = llm_arch_name(ctx->model.arch); + write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical + } -llama_kv_cache * llama_get_kv_self(llama_context * ctx) { - return &ctx->kv_self; -} + //void write_rng(const std::mt19937 & rng) { + // std::ostringstream rng_ss; + // rng_ss << rng; -enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { - return ctx->cparams.pooling_type; -} + // const std::string & rng_str = rng_ss.str(); -void llama_attach_threadpool( - struct llama_context * ctx, - ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch) { - ctx->threadpool = threadpool; - ctx->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; -} + // write_string(rng_str); + //} -void llama_detach_threadpool(struct llama_context * ctx) { - ctx->threadpool = nullptr; - ctx->threadpool_batch = nullptr; -} + void write_output_ids() { + ctx->reorder_outputs(); -void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { - ctx->cparams.n_threads = n_threads; - ctx->cparams.n_threads_batch = n_threads_batch; -} + const uint32_t n_outputs = ctx->n_outputs; -int32_t llama_n_threads(struct llama_context * ctx) { - return ctx->cparams.n_threads; -} + std::vector output_pos; -int32_t llama_n_threads_batch(struct llama_context * ctx) { - return ctx->cparams.n_threads_batch; -} + const size_t n_batch = ctx->cparams.n_batch; + const auto & output_ids = ctx->output_ids; -void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = abort_callback_data; + GGML_ASSERT(n_outputs <= ctx->output_size); - for (auto & backend : ctx->backends) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); - auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); - if (set_abort_callback_fn) { - set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); + output_pos.resize(n_outputs); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch; ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + GGML_ASSERT((uint32_t) pos < n_outputs); + output_pos[pos] = i; + } } - } -} -void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { - ctx->cparams.embeddings = embeddings; -} + write(&n_outputs, sizeof(n_outputs)); -void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { - ctx->cparams.causal_attn = causal_attn; -} + if (n_outputs) { + write(output_pos.data(), n_outputs * sizeof(int32_t)); + } + } -void llama_synchronize(struct llama_context * ctx) { - ggml_backend_sched_synchronize(ctx->sched.get()); + void write_logits() { + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); - // FIXME: if multiple single tokens are evaluated without a synchronization, - // the stats will be added to the prompt evaluation stats - // this should only happen when using batch size 1 to evaluate a batch + write(&logits_size, sizeof(logits_size)); - // add the evaluation to the stats - if (ctx->n_queued_tokens == 1) { - if (!ctx->cparams.no_perf) { - ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; - } - ctx->n_eval++; - } else if (ctx->n_queued_tokens > 1) { - if (!ctx->cparams.no_perf) { - ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (logits_size) { + write(ctx->logits, logits_size * sizeof(float)); } - ctx->n_p_eval += ctx->n_queued_tokens; } - // get a more accurate load time, upon first eval - if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; - } + void write_embeddings() { + const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); - ctx->n_queued_tokens = 0; - ctx->t_compute_start_us = 0; -} + write(&embeddings_size, sizeof(embeddings_size)); -float * 
llama_get_logits(struct llama_context * ctx) { - llama_synchronize(ctx); + if (embeddings_size) { + write(ctx->embd, embeddings_size * sizeof(float)); + } + } - // reorder logits for backward compatibility - ctx->reorder_outputs(); + llama_context_unified * ctx; +}; - return ctx->logits; -} +struct llama_data_read { + llama_data_read(llama_context_unified * ctx) : ctx(ctx) {} + virtual ~llama_data_read() = default; -float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { - int32_t j = -1; + virtual const uint8_t * read(size_t size) = 0; + virtual void read_to(void * dst, size_t size) = 0; + virtual size_t get_size_read() = 0; - llama_synchronize(ctx); + void read_string(std::string & str) { + uint32_t str_size; + read_to(&str_size, sizeof(str_size)); - try { - if (ctx->logits == nullptr) { - throw std::runtime_error("no logits"); - } + str.assign((const char *) read(str_size), str_size); + } - if (i < 0) { - j = ctx->n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); - } - } else if ((size_t) i >= ctx->output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size())); - } else { - j = ctx->output_ids[i]; - } + // validate model information + void read_model_info() { + const std::string cur_arch_str = llm_arch_name(ctx->model.arch); - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= ctx->n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); + std::string arch_str; + read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); } - - return ctx->logits + j*ctx->model.vocab.n_tokens(); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif + // TODO: add more info which needs to be identical but which is not verified otherwise } -} -float * llama_get_embeddings(struct llama_context * ctx) { - llama_synchronize(ctx); + //void read_rng(std::mt19937 & rng) { + // std::string rng_str; + // read_string(rng_str); - // reorder embeddings for backward compatibility - ctx->reorder_outputs(); + // std::istringstream rng_ss(rng_str); + // rng_ss >> rng; - return ctx->embd; -} + // if (rng_ss.fail()) { + // throw std::runtime_error("failed to load RNG state"); + // } + //} -float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { - int32_t j = -1; + void read_output_ids() { + std::vector output_pos; - llama_synchronize(ctx); + uint32_t n_outputs; + read_to(&n_outputs, sizeof(n_outputs)); - try { - if (ctx->embd == nullptr) { - throw std::runtime_error("no embeddings"); + if (n_outputs > ctx->reserve_outputs(n_outputs)) { + throw std::runtime_error("could not reserve outputs"); } - if (i < 0) { - j = ctx->n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); + if (n_outputs) { + output_pos.resize(n_outputs); + read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= ctx->cparams.n_batch) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, 
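// an output id that does not fit in the current batch indicates a corrupt
// or incompatible session state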
ctx->cparams.n_batch)); + } + ctx->output_ids[id] = i; } - } else if ((size_t) i >= ctx->output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size())); - } else { - j = ctx->output_ids[i]; - } - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= ctx->n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); + ctx->n_outputs = n_outputs; } - - return ctx->embd + j*ctx->model.hparams.n_embd; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif } -} -float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { - llama_synchronize(ctx); + void read_logits() { + uint64_t logits_size; + read_to(&logits_size, sizeof(logits_size)); - auto it = ctx->embd_seq.find(seq_id); - if (it == ctx->embd_seq.end()) { - return nullptr; - } + if (ctx->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } - return it->second.data(); -} + if (logits_size) { + read_to(ctx->logits, logits_size * sizeof(float)); + } + } -// llama adapter API + void read_embeddings() { + uint64_t embeddings_size; + read_to(&embeddings_size, sizeof(embeddings_size)); -int32_t llama_set_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter, - float scale) { - ctx->loras[adapter] = scale; - return 0; -} + if (ctx->embd_size < embeddings_size) { + throw std::runtime_error("embeddings buffer too small"); + } -int32_t llama_rm_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; + if (embeddings_size) { + read_to(ctx->embd, embeddings_size * sizeof(float)); + } } - return -1; -} + llama_context_unified * ctx; +}; -void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); -} +struct llama_data_write_dummy : llama_data_write { + llama_data_write_dummy(llama_context_unified * ctx) : llama_data_write(ctx) {} -int32_t llama_apply_adapter_cvec( - struct llama_context * ctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); -} + void write(const void * /* src */, size_t size) override { + size_written += size; + } -// -// kv cache view -// + void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { + size_written += size; + } -struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { - return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); -} + size_t get_size_written() override { + return size_written; + } -void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { - llama_kv_cache_view_update(view, ctx->kv_self); -} + size_t size_written = 0; +}; -// -// kv cache -// +struct llama_data_write_buffer : llama_data_write { + llama_data_write_buffer( + llama_context_unified * ctx, + uint8_t * p, size_t len) : llama_data_write(ctx), ptr(p), buf_size(len) {} -// deprecated -int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { - return llama_kv_self_n_tokens(ctx); -} + void write(const void * src, size_t size) override { + if (size 
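// buf_size is decremented after every write, so it always holds the
// remaining capacity of the caller-provided buffer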
> buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + memcpy(ptr, src, size); + ptr += size; + size_written += size; + buf_size -= size; + } -int32_t llama_kv_self_n_tokens(const llama_context * ctx) { - return llama_kv_cache_n_tokens(&ctx->kv_self); -} + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ggml_backend_tensor_get(tensor, ptr, offset, size); + ptr += size; + size_written += size; + buf_size -= size; + } -// deprecated -int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { - return llama_kv_self_used_cells(ctx); -} + size_t get_size_written() override { + return size_written; + } -int32_t llama_kv_self_used_cells(const llama_context * ctx) { - return llama_kv_cache_used_cells(&ctx->kv_self); -} + uint8_t * ptr; + size_t buf_size = 0; + size_t size_written = 0; +}; -// deprecated -void llama_kv_cache_clear(llama_context * ctx) { - llama_kv_self_clear(ctx); -} +struct llama_data_read_buffer : llama_data_read { + llama_data_read_buffer( + llama_context_unified * ctx, + const uint8_t * p, size_t len) : llama_data_read(ctx), ptr(p), buf_size(len) {} -void llama_kv_self_clear(llama_context * ctx) { - llama_kv_cache_clear(&ctx->kv_self); -} + const uint8_t * read(size_t size) override { + const uint8_t * base_ptr = ptr; + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ptr += size; + size_read += size; + buf_size -= size; + return base_ptr; + } -// deprecated -bool llama_kv_cache_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); -} + void read_to(void * dst, size_t size) override { + memcpy(dst, read(size), size); + } -bool llama_kv_self_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); -} + size_t get_size_read() override { + return size_read; + } -// deprecated -void llama_kv_cache_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); -} + const uint8_t * ptr; + size_t buf_size = 0; + size_t size_read = 0; +}; -void llama_kv_self_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} +struct llama_data_write_file : llama_data_write { + llama_data_write_file( + llama_context_unified * ctx, + llama_file * f) : llama_data_write(ctx), file(f) {} -// deprecated -void llama_kv_cache_seq_keep( - llama_context * ctx, - llama_seq_id seq_id) { - return llama_kv_self_seq_keep(ctx, seq_id); -} + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } -void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); -} + void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + temp_buffer.resize(size); + ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); + write(temp_buffer.data(), temp_buffer.size()); + } -// deprecated -void llama_kv_cache_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - 
llama_pos delta) { - return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); -} + size_t get_size_written() override { + return size_written; + } -void llama_kv_self_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); -} + llama_file * file; + size_t size_written = 0; + std::vector temp_buffer; +}; -// deprecated -void llama_kv_cache_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +struct llama_data_read_file : llama_data_read { + llama_data_read_file( + llama_context_unified * ctx, + llama_file * f) : llama_data_read(ctx), file(f) {} + + void read_to(void * dst, size_t size) override { + file->read_raw(dst, size); + size_read += size; + } + + const uint8_t * read(size_t size) override { + temp_buffer.resize(size); + read_to(temp_buffer.data(), size); + return temp_buffer.data(); + } + + size_t get_size_read() override { + return size_read; + } + + llama_file * file; + size_t size_read = 0; + std::vector temp_buffer; +}; + +size_t llama_context_unified::state_get_size() { + llama_data_write_dummy data_ctx(this); + try { + return state_get_data(data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } } -void llama_kv_self_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); +size_t llama_context_unified::state_get_data(uint8_t * dst, size_t size) { + llama_data_write_buffer data_ctx(this, dst, size); + try { + return state_get_data(data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } } -// deprecated -llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_self_seq_pos_max(ctx, seq_id); +size_t llama_context_unified::state_set_data(const uint8_t * src, size_t size) { + llama_data_read_buffer data_ctx(this, src, size); + try { + return state_set_data(data_ctx); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } } -llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +size_t llama_context_unified::state_seq_get_size(llama_seq_id seq_id) { + llama_data_write_dummy data_ctx(this); + try { + return state_seq_get_data(data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } } -// deprecated -void llama_kv_cache_defrag(llama_context * ctx) { - return llama_kv_self_defrag(ctx); +size_t llama_context_unified::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { + llama_data_write_buffer data_ctx(this, dst, size); + try { + return state_seq_get_data(data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } } -void llama_kv_self_defrag(llama_context * ctx) { - return llama_kv_cache_defrag(&ctx->kv_self); +size_t llama_context_unified::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { + llama_data_read_buffer data_ctx(this, src, size); + try { + return 
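// forward to the stream-based overload; errors are reported by returning 0
// instead of propagating the exception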
state_seq_set_data(data_ctx, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } } -// deprecated -bool llama_kv_cache_can_shift(const llama_context * ctx) { - return llama_kv_self_can_shift(ctx); +bool llama_context_unified::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); + + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; + } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } + + // restore the context state + { + const size_t n_state_size_cur = file.size() - file.tell(); + + llama_data_read_file data_ctx(this, &file); + const size_t n_read = state_set_data(data_ctx); + + if (n_read != n_state_size_cur) { + LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); + return false; + } + } + + return true; } -bool llama_kv_self_can_shift(const llama_context * ctx) { - return llama_kv_cache_can_shift(&ctx->kv_self); +bool llama_context_unified::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); + + file.write_u32(LLAMA_SESSION_MAGIC); + file.write_u32(LLAMA_SESSION_VERSION); + + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state using stream saving + llama_data_write_file data_ctx(this, &file); + state_get_data(data_ctx); + + return true; } -// deprecated -void llama_kv_cache_update(llama_context * ctx) { - llama_kv_self_update(ctx); -} +size_t llama_context_unified::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); + + // version checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); + return 0; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); + return 0; + } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } -// llama state API + // restore the context state + { + const size_t state_size = file.size() - file.tell(); + llama_data_read_file data_ctx(this, &file); + const size_t nread = state_seq_set_data(data_ctx, seq_id); + if (!nread) { + LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); + return 0; + } + GGML_ASSERT(nread <= state_size); + GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); + } -// deprecated -size_t llama_get_state_size(struct llama_context * ctx) { - return llama_state_get_size(ctx); + return file.tell(); } -// deprecated -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { - return llama_state_get_data(ctx, dst, -1); -} +size_t llama_context_unified::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); -// deprecated -size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { - return llama_state_set_data(ctx, src, -1); -} + file.write_u32(LLAMA_STATE_SEQ_MAGIC); + file.write_u32(LLAMA_STATE_SEQ_VERSION); -// deprecated -bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); -} + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); -// deprecated -bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - return llama_state_save_file(ctx, path_session, tokens, n_token_count); -} + // save the context state using stream saving + llama_data_write_file data_ctx(this, &file); + state_seq_get_data(data_ctx, seq_id); -// TODO: replace all non-fatal assertions with returned errors or exceptions -struct llama_data_write { - virtual void write(const void * src, size_t size) = 0; - virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; - virtual size_t get_size_written() = 0; - virtual ~llama_data_write() = default; + const size_t res = file.tell(); + GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written()); - void write_string(const std::string & str) { - uint32_t str_size = str.size(); + return res; +} - write(&str_size, sizeof(str_size)); - write(str.data(), str_size); - } +/** copy state data into either a buffer or file depending on the passed in context + * + * file context: + * llama_file file("/path", "wb"); + * llama_data_write_file data_ctx(&file); + * llama_state_get_data_internal(ctx, data_ctx); + * + * buffer context: + * std::vector buf(max_size, 0); + * llama_data_write_buffer data_ctx(buf.data(), max_size); + * llama_state_get_data_internal(ctx, data_ctx); + * +*/ +size_t llama_context_unified::state_get_data(llama_data_write & data_ctx) { + synchronize(); - void write_model_info(const struct llama_context * ctx) { - const std::string arch_str = llm_arch_name(ctx->model.arch); - write_string(arch_str); - // TODO: add more model-specific info which should prevent loading the session file if not identical - } + 
data_ctx.write_model_info(); - //void write_rng(const std::mt19937 & rng) { - // std::ostringstream rng_ss; - // rng_ss << rng; + // copy outputs + data_ctx.write_output_ids(); + data_ctx.write_logits(); + data_ctx.write_embeddings(); - // const std::string & rng_str = rng_ss.str(); + llama_kv_cache::io io = { + /* .write = */ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read = */ nullptr, + /* .read_to = */ nullptr, + }; - // write_string(rng_str); - //} + kv_self.state_write(io, model.hparams); - void write_output_ids(struct llama_context * ctx) { - ctx->reorder_outputs(); + return data_ctx.get_size_written(); +} - const uint32_t n_outputs = ctx->n_outputs; +size_t llama_context_unified::state_set_data(llama_data_read & data_ctx) { + synchronize(); - std::vector output_pos; + data_ctx.read_model_info(); - const size_t n_batch = ctx->cparams.n_batch; - const auto & output_ids = ctx->output_ids; + // set outputs + data_ctx.read_output_ids(); + data_ctx.read_logits(); + data_ctx.read_embeddings(); - GGML_ASSERT(n_outputs <= ctx->output_size); + llama_kv_cache::io io = { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { + return data_ctx.read(size); + }, + /* .read_to = */ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; - output_pos.resize(n_outputs); + kv_self.state_read(io, model.hparams); - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch; ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - GGML_ASSERT((uint32_t) pos < n_outputs); - output_pos[pos] = i; - } - } + return data_ctx.get_size_read(); +} - write(&n_outputs, sizeof(n_outputs)); +size_t llama_context_unified::state_seq_get_data(llama_data_write & data_ctx, llama_seq_id seq_id) { + synchronize(); - if (n_outputs) { - write(output_pos.data(), n_outputs * sizeof(int32_t)); - } - } + llama_kv_cache::io io = { + /* .write = */ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read = */ nullptr, + /* .read_to = */ nullptr, + }; - void write_logits(const struct llama_context * ctx) { - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); + kv_self.state_write(io, model.hparams, seq_id); - write(&logits_size, sizeof(logits_size)); + return data_ctx.get_size_written(); +} - if (logits_size) { - write(ctx->logits, logits_size * sizeof(float)); - } - } +size_t llama_context_unified::state_seq_set_data(llama_data_read & data_ctx, llama_seq_id seq_id) { + synchronize(); - void write_embeddings(const struct llama_context * ctx) { - const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); + llama_kv_cache::io io = { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { + return data_ctx.read(size); + }, + /* .read_to = */ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; - write(&embeddings_size, sizeof(embeddings_size)); + kv_self.state_read(io, model.hparams, seq_id); - if (embeddings_size) { - write(ctx->embd, 
embeddings_size * sizeof(float)); - } - } -}; + return data_ctx.get_size_read(); +} -struct llama_data_read { - virtual const uint8_t * read(size_t size) = 0; - virtual void read_to(void * dst, size_t size) = 0; - virtual size_t get_size_read() = 0; - virtual ~llama_data_read() = default; +// +// interface implementation +// - void read_string(std::string & str) { - uint32_t str_size; - read_to(&str_size, sizeof(str_size)); +void llama_free(struct llama_context * ctx) { + delete ctx; +} - str.assign((const char *) read(str_size), str_size); - } +uint32_t llama_n_ctx(const struct llama_context * ctx) { + return ctx->n_ctx(); +} - // validate model information - void read_model_info(const struct llama_context * ctx) { - const std::string cur_arch_str = llm_arch_name(ctx->model.arch); +uint32_t llama_n_batch(const struct llama_context * ctx) { + return ctx->n_batch(); +} - std::string arch_str; - read_string(arch_str); - if (cur_arch_str != arch_str) { - throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); - } - // TODO: add more info which needs to be identical but which is not verified otherwise - } +uint32_t llama_n_ubatch(const struct llama_context * ctx) { + return ctx->n_ubatch(); +} - //void read_rng(std::mt19937 & rng) { - // std::string rng_str; - // read_string(rng_str); +uint32_t llama_n_seq_max(const struct llama_context * ctx) { + return ctx->n_seq_max(); +} - // std::istringstream rng_ss(rng_str); - // rng_ss >> rng; +const llama_model * llama_get_model(const llama_context * ctx) { + return &ctx->model; +} - // if (rng_ss.fail()) { - // throw std::runtime_error("failed to load RNG state"); - // } - //} +llama_kv_cache * llama_get_kv_self(llama_context * ctx) { + return ctx->get_kv_self(); +} - void read_output_ids(struct llama_context * ctx) { - std::vector output_pos; +void llama_kv_self_update(llama_context * ctx) { + ctx->kv_self_update(); +} - uint32_t n_outputs; - read_to(&n_outputs, sizeof(n_outputs)); +enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { + return ctx->pooling_type(); +} - if (n_outputs > ctx->reserve_outputs(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } +void llama_attach_threadpool( + struct llama_context * ctx, + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + ctx->threadpool = threadpool; + ctx->threadpool_batch = threadpool_batch ? 
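// when no dedicated batch threadpool is given, fall back to the
// single-token threadpool for batch processing as well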
threadpool_batch : threadpool; +} - if (n_outputs) { - output_pos.resize(n_outputs); - read_to(output_pos.data(), n_outputs * sizeof(int32_t)); +void llama_detach_threadpool(struct llama_context * ctx) { + ctx->threadpool = nullptr; + ctx->threadpool_batch = nullptr; +} - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= ctx->cparams.n_batch) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch)); - } - ctx->output_ids[id] = i; - } +void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { + ctx->cparams.n_threads = n_threads; + ctx->cparams.n_threads_batch = n_threads_batch; +} - ctx->n_outputs = n_outputs; - } - } +int32_t llama_n_threads(struct llama_context * ctx) { + return ctx->cparams.n_threads; +} - void read_logits(struct llama_context * ctx) { - uint64_t logits_size; - read_to(&logits_size, sizeof(logits_size)); +int32_t llama_n_threads_batch(struct llama_context * ctx) { + return ctx->cparams.n_threads_batch; +} - if (ctx->logits_size < logits_size) { - throw std::runtime_error("logits buffer too small"); - } +void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = abort_callback_data; - if (logits_size) { - read_to(ctx->logits, logits_size * sizeof(float)); + for (auto & backend : ctx->backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); } } +} - void read_embeddings(struct llama_context * ctx) { - uint64_t embeddings_size; - read_to(&embeddings_size, sizeof(embeddings_size)); +void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { + ctx->cparams.embeddings = embeddings; +} - if (ctx->embd_size < embeddings_size) { - throw std::runtime_error("embeddings buffer too small"); - } +void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { + ctx->cparams.causal_attn = causal_attn; +} - if (embeddings_size) { - read_to(ctx->embd, embeddings_size * sizeof(float)); - } - } -}; +void llama_synchronize(struct llama_context * ctx) { + ctx->synchronize(); +} -struct llama_data_write_dummy : llama_data_write { - size_t size_written = 0; +float * llama_get_logits(struct llama_context * ctx) { + ctx->synchronize(); - llama_data_write_dummy() {} + return ctx->get_logits(); +} - void write(const void * /* src */, size_t size) override { - size_written += size; - } +float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + ctx->synchronize(); - void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { - size_written += size; - } + return ctx->get_logits_ith(i); +} - size_t get_size_written() override { - return size_written; - } -}; +float * llama_get_embeddings(struct llama_context * ctx) { + ctx->synchronize(); -struct llama_data_write_buffer : llama_data_write { - uint8_t * ptr; - size_t buf_size = 0; - size_t size_written = 0; + return ctx->get_embeddings(); +} - llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {} +float * llama_get_embeddings_ith(struct llama_context 
* ctx, int32_t i) { + ctx->synchronize(); - void write(const void * src, size_t size) override { - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - memcpy(ptr, src, size); - ptr += size; - size_written += size; - buf_size -= size; - } + return ctx->get_embeddings_ith(i); +} - void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - ggml_backend_tensor_get(tensor, ptr, offset, size); - ptr += size; - size_written += size; - buf_size -= size; - } +float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { + ctx->synchronize(); - size_t get_size_written() override { - return size_written; - } -}; + return ctx->get_embeddings_seq(seq_id); +} -struct llama_data_read_buffer : llama_data_read { - const uint8_t * ptr; - size_t buf_size = 0; - size_t size_read = 0; +// llama adapter API - llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} +int32_t llama_set_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter, + float scale) { + ctx->loras[adapter] = scale; + return 0; +} - const uint8_t * read(size_t size) override { - const uint8_t * base_ptr = ptr; - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - ptr += size; - size_read += size; - buf_size -= size; - return base_ptr; +int32_t llama_rm_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter) { + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); + return 0; } - void read_to(void * dst, size_t size) override { - memcpy(dst, read(size), size); - } + return -1; +} - size_t get_size_read() override { - return size_read; - } -}; +void llama_clear_adapter_lora(struct llama_context * ctx) { + ctx->loras.clear(); +} -struct llama_data_write_file : llama_data_write { - llama_file * file; - size_t size_written = 0; - std::vector temp_buffer; +int32_t llama_apply_adapter_cvec( + struct llama_context * ctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); +} - llama_data_write_file(llama_file * f) : file(f) {} +// +// kv cache view +// - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; - } +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { + return llama_kv_cache_view_init(*ctx->get_kv_self(), n_seq_max); +} - void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { - temp_buffer.resize(size); - ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); - write(temp_buffer.data(), temp_buffer.size()); - } +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { + llama_kv_cache_view_update(view, *ctx->get_kv_self()); +} - size_t get_size_written() override { - return size_written; - } -}; +// +// kv cache +// -struct llama_data_read_file : llama_data_read { - llama_file * file; - size_t size_read = 0; - std::vector temp_buffer; +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} - llama_data_read_file(llama_file * f) : file(f) {} +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { + return 
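// the context-level KV helpers simply forward to the llama_kv_cache
// instance owned by the context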
llama_kv_cache_n_tokens(ctx->get_kv_self()); +} - void read_to(void * dst, size_t size) override { - file->read_raw(dst, size); - size_read += size; - } +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} - const uint8_t * read(size_t size) override { - temp_buffer.resize(size); - read_to(temp_buffer.data(), size); - return temp_buffer.data(); - } +int32_t llama_kv_self_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(ctx->get_kv_self()); +} - size_t get_size_read() override { - return size_read; - } -}; +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} -/** copy state data into either a buffer or file depending on the passed in context - * - * file context: - * llama_file file("/path", "wb"); - * llama_data_write_file data_ctx(&file); - * llama_state_get_data_internal(ctx, data_ctx); - * - * buffer context: - * std::vector buf(max_size, 0); - * llama_data_write_buffer data_ctx(buf.data(), max_size); - * llama_state_get_data_internal(ctx, data_ctx); - * -*/ -static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) { - llama_synchronize(ctx); +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(ctx->get_kv_self()); +} - data_ctx.write_model_info(ctx); +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} - // copy outputs - data_ctx.write_output_ids(ctx); - data_ctx.write_logits(ctx); - data_ctx.write_embeddings(ctx); +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(ctx->get_kv_self(), seq_id, p0, p1); +} - llama_kv_cache::io io = { - /* .write = */ [&](const void * src, size_t size) { - data_ctx.write(src, size); - }, - /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { - data_ctx.write_tensor_data(tensor, offset, size); - }, - /* .read = */ nullptr, - /* .read_to = */ nullptr, - }; +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(ctx->get_kv_self(), seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} - ctx->kv_self.state_write(io, ctx->model.hparams); +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(ctx->get_kv_self(), seq_id); +} - return data_ctx.get_size_written(); +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); } -size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { - llama_data_write_buffer data_ctx(dst, size); - try { - return llama_state_get_data_internal(ctx, data_ctx); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); - return 0; - } +void 
llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(ctx->get_kv_self(), seq_id, p0, p1, delta); } -// Returns the *actual* size of the state. -// Intended to be used when saving to state to a buffer. -size_t llama_state_get_size(struct llama_context * ctx) { - llama_data_write_dummy data_ctx; - try { - return llama_state_get_data_internal(ctx, data_ctx); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); - return 0; - } +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); } -static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) { - llama_synchronize(ctx); +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(ctx->get_kv_self(), seq_id, p0, p1, d); +} - data_ctx.read_model_info(ctx); +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} - // set outputs - data_ctx.read_output_ids(ctx); - data_ctx.read_logits(ctx); - data_ctx.read_embeddings(ctx); +llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(ctx->get_kv_self(), seq_id); +} - llama_kv_cache::io io = { - /* .write = */ nullptr, - /* .write_tensor_data = */ nullptr, - /* .read = */ [&](size_t size) { - return data_ctx.read(size); - }, - /* .read_to = */ [&](void * dst, size_t size) { - data_ctx.read_to(dst, size); - }, - }; +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} - ctx->kv_self.state_read(io, ctx->model.hparams); +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(ctx->get_kv_self()); +} - return data_ctx.get_size_read(); +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); } -// Sets the state reading from the specified source address -size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { - llama_data_read_buffer data_ctx(src, size); - try { - return llama_state_set_data_internal(ctx, data_ctx); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); - return 0; - } +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(ctx->get_kv_self()); } -static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(path_session, "rb"); +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} - // sanity checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); +// llama state API - if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } - } +// deprecated +size_t llama_get_state_size(struct llama_context * ctx) { + return llama_state_get_size(ctx); +} - // load the prompt - { - const uint32_t n_token_count = 
file.read_u32(); +// deprecated +size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { + return llama_state_get_data(ctx, dst, -1); +} - if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); - return false; - } +// deprecated +size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { + return llama_state_set_data(ctx, src, -1); +} - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } +// deprecated +bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); +} - // restore the context state - { - const size_t n_state_size_cur = file.size() - file.tell(); +// deprecated +bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + return llama_state_save_file(ctx, path_session, tokens, n_token_count); +} - llama_data_read_file data_ctx(&file); - const size_t n_read = llama_state_set_data_internal(ctx, data_ctx); +// Returns the *actual* size of the state. +// Intended to be used when saving to state to a buffer. +size_t llama_state_get_size(struct llama_context * ctx) { + return ctx->state_get_size(); +} - if (n_read != n_state_size_cur) { - LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); - return false; - } - } - return true; +size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { + return ctx->state_get_data(dst, size); +} + +// Sets the state reading from the specified source address +size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { + return ctx->state_set_data(src, size); } bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { try { - return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); + return ctx->state_load_file(path_session, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what()); return false; } } -static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - llama_file file(path_session, "wb"); - - file.write_u32(LLAMA_SESSION_MAGIC); - file.write_u32(LLAMA_SESSION_VERSION); - - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - // save the context state using stream saving - llama_data_write_file data_ctx(&file); - llama_state_get_data_internal(ctx, data_ctx); - - return true; -} - bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { try { - return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count); + return ctx->state_save_file(path_session, tokens, n_token_count); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what()); return false; } } -static size_t 
llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { - llama_synchronize(ctx); - - llama_kv_cache::io io = { - /* .write = */ [&](const void * src, size_t size) { - data_ctx.write(src, size); - }, - /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { - data_ctx.write_tensor_data(tensor, offset, size); - }, - /* .read = */ nullptr, - /* .read_to = */ nullptr, - }; - - ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); - - return data_ctx.get_size_written(); -} - size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) { - llama_data_write_dummy data_ctx; - return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); + return ctx->state_seq_get_size(seq_id); } size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { - llama_data_write_buffer data_ctx(dst, size); - try { - return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what()); - return 0; - } -} - -static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { - llama_synchronize(ctx); - - llama_kv_cache::io io = { - /* .write = */ nullptr, - /* .write_tensor_data = */ nullptr, - /* .read = */ [&](size_t size) { - return data_ctx.read(size); - }, - /* .read_to = */ [&](void * dst, size_t size) { - data_ctx.read_to(dst, size); - }, - }; - - ctx->kv_self.state_read(io, ctx->model.hparams, dest_seq_id); - - return data_ctx.get_size_read(); -} - -size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) { - llama_data_read_buffer data_ctx(src, size); - try { - return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what()); - return 0; - } -} - -static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); - - file.write_u32(LLAMA_STATE_SEQ_MAGIC); - file.write_u32(LLAMA_STATE_SEQ_VERSION); - - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - // save the context state using stream saving - llama_data_write_file data_ctx(&file); - llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); - - const size_t res = file.tell(); - GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written()); - return res; + return ctx->state_seq_get_data(seq_id, dst, size); } -static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); - - // version checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); - - if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { - LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); - return 0; - } - } - - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); - - if 
(n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); - return 0; - } - - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } - - // restore the context state - { - const size_t state_size = file.size() - file.tell(); - llama_data_read_file data_ctx(&file); - const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id); - if (!nread) { - LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); - return 0; - } - GGML_ASSERT(nread <= state_size); - GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); - } - - return file.tell(); +size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) { + return ctx->state_seq_set_data(seq_id, src, size); } size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { try { - return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count); + return ctx->state_seq_save_file(seq_id, filepath, tokens, n_token_count); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what()); return 0; @@ -3752,7 +3973,7 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { try { - return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out); + return ctx->state_seq_load_file(dest_seq_id, filepath, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what()); return 0; diff --git a/src/llama-context.h b/src/llama-context.h index 8f22fd3b1d3a1..f7e007f3273c5 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,38 +16,245 @@ using llama_loras = std::unordered_map; -struct llama_batch_manager_i; - -// TODO: make implementation details private -// TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - // TODO: tmp until llama-model starts implementing the graph build function - typedef std::function build_graph_callback; + llama_context(const llama_model & model); + virtual ~llama_context(); - llama_context( - const llama_model & model, - const llama_context_params & params, - build_graph_callback && cb_build_graph); + virtual void synchronize(); + + virtual uint32_t n_ctx() const = 0; + virtual uint32_t n_batch() const = 0; + virtual uint32_t n_ubatch() const = 0; + virtual uint32_t n_seq_max() const = 0; + + virtual llama_kv_cache * get_kv_self() = 0; + virtual const llama_kv_cache * get_kv_self() const = 0; + + virtual void kv_self_update() = 0; + + virtual enum llama_pooling_type pooling_type() const = 0; + + virtual float * get_logits() = 0; + virtual float * get_logits_ith(int32_t i) = 0; + + virtual float * get_embeddings() = 0; + virtual float * get_embeddings_ith(int32_t i) = 0; + virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; + + int64_t n_pos_per_token() const; // vision + + virtual ggml_context_ptr init(); + + 
virtual int decode(llama_batch & inp_batch) = 0; + virtual int encode(llama_batch & inp_batch) = 0; + + // graph build API (generic) + + // do mat_mul, while optionally apply lora + virtual ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + // do mat_mul_id, while optionally apply lora + virtual ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + + // graph build API (context-specific) + + virtual ggml_tensor * build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) = 0; + + virtual ggml_tensor * build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual ggml_tensor * build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual ggml_tensor * build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) = 0; + + virtual void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) = 0; + + virtual ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) = 0; + + virtual ggml_tensor * get_rope_factors(int il) = 0; + + virtual void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) = 0; + + virtual ggml_tensor * build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_KQ_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) = 0; + + virtual ggml_tensor * build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) = 0; + + virtual ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; - virtual ~llama_context() = default; + virtual ggml_tensor * build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; - const struct llama_model & model; + virtual ggml_tensor * build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + 
ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + // state save/load + + virtual size_t state_get_size() = 0; + virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; + virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; + + virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; + virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; + virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; + + virtual bool state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) = 0; + + virtual bool state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) = 0; + + virtual size_t state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) = 0; + + virtual size_t state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) = 0; + + // members + + const llama_model & model; llama_cparams cparams; - llama_sbatch sbatch; // TODO: revisit if needed llama_adapter_cvec cvec; llama_loras loras; - build_graph_callback cb_build_graph; + ggml_threadpool_t threadpool = nullptr; + ggml_threadpool_t threadpool_batch = nullptr; + + ggml_abort_callback abort_callback = nullptr; + void * abort_callback_data = nullptr; std::vector backends; std::vector> set_n_threads_fns; ggml_backend_t backend_cpu = nullptr; - ggml_threadpool_t threadpool = nullptr; - ggml_threadpool_t threadpool_batch = nullptr; + ggml_backend_sched_ptr sched; + + // memory buffers used to evaluate the model + std::vector buf_compute_meta; + // perf bool has_evaluated_once = false; mutable int64_t t_start_us; @@ -60,6 +267,49 @@ struct llama_context { mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls +}; + +// TODO: make implementation details private +struct llama_context_unified : public llama_context { + struct batch_manager; + + // TODO: tmp until llama-model starts implementing the graph build function + typedef std::function build_graph_callback; + + llama_context_unified( + const llama_model & model, + const llama_context_params & params, + build_graph_callback && cb_build_graph); + + virtual ~llama_context_unified(); + + virtual uint32_t n_ctx() const override; + virtual uint32_t n_batch() const override; + virtual uint32_t n_ubatch() const override; + virtual uint32_t n_seq_max() const override; + + virtual llama_kv_cache * get_kv_self() override; + virtual const llama_kv_cache * get_kv_self() const override; + + virtual void kv_self_update() override; + + virtual enum llama_pooling_type pooling_type() const override; + + virtual float * get_logits() override; + virtual float * get_logits_ith(int32_t i) override; + + virtual float * get_embeddings() override; + virtual float * get_embeddings_ith(int32_t i) override; + virtual float * get_embeddings_seq(llama_seq_id seq_id) override; + + virtual ggml_context_ptr init() override; + + virtual int decode(llama_batch & inp_batch) override; + virtual int encode(llama_batch & inp_batch) override; + + llama_sbatch sbatch; + + build_graph_callback cb_build_graph; // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; @@ -72,7 
+322,7 @@ struct llama_context { size_t output_size = 0; // capacity (of tokens positions) for the output buffers int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch - bool logits_all = false; + bool logits_all = false; bool need_reserve = false; // embeddings output (2-dimensional array: [n_outputs][n_embd]) @@ -84,17 +334,7 @@ struct llama_context { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - // memory buffers used to evaluate the model - std::vector buf_compute_meta; - ggml_backend_sched_ptr sched; - - ggml_abort_callback abort_callback = nullptr; - void * abort_callback_data = nullptr; - - virtual std::unique_ptr prepare_batch(const llama_batch & batch); - - virtual int decode(llama_batch & inp_batch); - virtual int encode(llama_batch & inp_batch); + virtual std::unique_ptr prepare_batch(const llama_batch & batch); // returns the result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( @@ -107,32 +347,19 @@ struct llama_context { // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - void reset(); - void prepare_k_shift(); void prepare_defrag(); void set_inputs(const llama_ubatch & ubatch); // make the outputs have the same order they had in the user-provided batch - // TODO: maybe deprecate this + // TODO: maybe remove this void reorder_outputs(); // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. size_t reserve_outputs(size_t n_outputs); - ggml_tensor * build_lora_mm( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur); - - ggml_tensor * build_lora_mm_id( - ggml_context * ctx0, - ggml_tensor * w, // struct ggml_tensor * as - ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids); - // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] @@ -141,49 +368,55 @@ struct llama_context { struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] - // === encoder-decoder === - - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - // === unified KV cache === - llama_kv_cache kv_self; + llama_kv_cache kv_self; struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] struct ggml_tensor * inp_K_shift; // I32 [kv_size] - // return true if need to reserve new worst-case graph - void kv_self_update(); + virtual ggml_tensor * build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) override; + + virtual ggml_tensor * build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) override; - void build_attn_inp( + virtual ggml_tensor * build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) override; + + virtual ggml_tensor * build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) override; + 
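
For orientation, each of these overrides is expected to take over the input-tensor setup that later hunks in this series delete from llm_build_context (a ggml_new_tensor_* call plus ggml_set_input on the corresponding member tensor). A minimal sketch of one such override, assuming the implementation in llama-context.cpp mirrors the code removed below and uses the inp_pos member declared above (hypothetical body, for illustration only):

    // Sketch: mirrors the inp_pos setup that llm_build_context::build_inp_pos used to do inline.
    ggml_tensor * llama_context_unified::build_inp_pos(ggml_context * ctx0, int32_t n_tokens) {
        // allocate the I32 position tensor in the graph context and mark it as a graph input
        inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
        ggml_set_input(inp_pos);

        return inp_pos;
    }

The cb() naming callback is deliberately not part of the sketch: as the llama.cpp hunks below show, the llm_build_context wrappers keep calling cb(cur, "inp_pos", -1) after delegating to the context.
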
+ virtual ggml_tensor * build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) override; + + virtual void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa, - bool worst_case); + bool worst_case) override; - void build_attn_kv_store( + virtual void build_attn_kv_store( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, int64_t il, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_attn_qkv( + virtual ggml_tensor * build_attn_qkv( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * wo, @@ -192,39 +425,65 @@ struct llama_context { int32_t n_tokens, float kq_scale, int il, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_soft_max_ext( + virtual ggml_tensor * build_soft_max_ext( ggml_context * ctx0, ggml_tensor * kq, - float kq_scale); + float kq_scale) override; - ggml_tensor * get_rope_factors(int il); + virtual ggml_tensor * get_rope_factors(int il) override; - void build_k_shift( + virtual void build_k_shift( ggml_context * ctx0, - ggml_cgraph * graph); + ggml_cgraph * graph) override; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - void build_defrag( + virtual void build_defrag( ggml_context * ctx0, - ggml_cgraph * graph); + ggml_cgraph * graph) override; + + // === encoder-decoder === + + // whether we are computing encoder output or decoder output + bool is_encoding = false; + + // output of the encoder part of the encoder-decoder models + std::vector embd_enc; + std::vector> seq_ids_enc; + + struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] + struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] + struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + + virtual ggml_tensor * build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) override; + + virtual ggml_tensor * build_inp_KQ_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) override; // === recurrent === + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + // TODO: add recurrent cache // TODO: add mamba-specific llama_context // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl - ggml_tensor * build_inp_s_copy( + virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_inp_s_mask( + virtual ggml_tensor * build_inp_s_mask( ggml_context * ctx0, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_copy_mask_state( + virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * s, @@ -233,9 +492,9 @@ struct llama_context { int32_t n_tokens, int32_t n_state, int32_t n_seqs, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_mamba_layer( + virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -243,25 +502,25 @@ struct llama_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_rwkv_token_shift_load( + virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case); + bool worst_case) override; - ggml_tensor * 
build_rwkv_token_shift_store( + virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, int il, - bool worst_case); + bool worst_case) override; - ggml_tensor * build_rwkv6_time_mix( + virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -270,17 +529,48 @@ struct llama_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case); - - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - - // === vision === - - // TODO: find a better way to accommodate mutli-dimension position encoding methods - // number of position id each token get, 1 for each token in most cases. - // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. - int n_pos_per_token = 1; + bool worst_case) override; + + // state save/load + + virtual size_t state_get_size() override; + virtual size_t state_get_data( uint8_t * dst, size_t size) override; + virtual size_t state_set_data(const uint8_t * src, size_t size) override; + + virtual size_t state_seq_get_size(llama_seq_id seq_id) override; + virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; + virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; + + virtual bool state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) override; + + virtual bool state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) override; + + virtual size_t state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) override; + + virtual size_t state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) override; + +private: + size_t state_get_data(struct llama_data_write & data_ctx); + size_t state_set_data(struct llama_data_read & data_ctx); + + size_t state_seq_get_data(struct llama_data_write & data_ctx, llama_seq_id seq_id); + size_t state_seq_set_data(struct llama_data_read & data_ctx, llama_seq_id seq_id); }; // For internal test use diff --git a/src/llama.cpp b/src/llama.cpp index ed5e1e5254e7a..7c002f9bf8ff0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8,7 +8,6 @@ #include "llama-model.h" #include "ggml.h" -#include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml-cpp.h" @@ -86,8 +85,6 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_outputs; - const int32_t n_outputs_enc; const int32_t n_ctx_orig; const bool worst_case; @@ -98,9 +95,8 @@ struct llm_build_context { const llm_build_cb & cb; - std::vector & buf_compute_meta; - - struct ggml_context * ctx0 = nullptr; + const ggml_context_ptr ctx = nullptr; + ggml_context * ctx0 = nullptr; // TODO: consider making the entire interface noexcept llm_build_context( @@ -136,132 +132,37 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - n_outputs (worst_case ? n_tokens : lctx.n_outputs), - n_outputs_enc (worst_case ? 
n_tokens : lctx.embd_enc.size() / hparams.n_embd), n_ctx_orig (cparams.n_ctx_orig_yarn), worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), cb (cb), - buf_compute_meta (lctx.buf_compute_meta) { - // all initializations should be done in init() + ctx (lctx.init()), + ctx0 (ctx.get()) { } - void init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - ctx0 = ggml_init(params); - - lctx.reset(); - } - - void free() { - ggml_free(ctx0); - ctx0 = nullptr; - } - + // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL; - - if (ubatch.token) { - lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - cb(lctx.inp_tokens, "inp_tokens", -1); - ggml_set_input(lctx.inp_tokens); - - inpL = ggml_get_rows(ctx0, tok_embd, lctx.inp_tokens); - - // apply lora for embedding tokens if needed - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( - ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, lctx.inp_tokens) - ), scale); - - inpL = ggml_add(ctx0, inpL, inpL_delta); - } - } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = lctx.inp_embd; - ggml_set_input(lctx.inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); - } - + struct ggml_tensor * inpL = lctx.build_inp_embd(ctx0, tok_embd, ubatch); cb(inpL, "inp_embd", -1); return inpL; } - // do mat_mul, while optionally apply lora + // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, struct ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - - return res; + return lctx.build_lora_mm(ctx0, w, cur); } - // do mat_mul_id, while optionally apply lora + // TODO: tmp struct ggml_tensor * build_lora_mm_id( struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b struct ggml_tensor * ids) { - struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } - - const float alpha = lora.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? 
lora.second * alpha / rank : lora.second; - - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - - return res; + return lctx.build_lora_mm_id(ctx0, w, cur, ids); } struct ggml_tensor * build_norm( @@ -620,31 +521,31 @@ struct llm_build_context { } struct ggml_tensor * build_inp_pos() { - lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(lctx.inp_pos, "inp_pos", -1); - ggml_set_input(lctx.inp_pos); - return lctx.inp_pos; + ggml_tensor * cur = lctx.build_inp_pos(ctx0, n_tokens); + cb(cur, "inp_pos", -1); + + return cur; } struct ggml_tensor * build_inp_out_ids() { - lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); - cb(lctx.inp_out_ids, "inp_out_ids", -1); - ggml_set_input(lctx.inp_out_ids); - return lctx.inp_out_ids; + ggml_tensor * cur = lctx.build_inp_out_ids(ctx0, n_tokens, worst_case); + cb(cur, "inp_out_ids", -1); + + return cur; } struct ggml_tensor * build_inp_mean() { - lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - cb(lctx.inp_mean, "inp_mean", -1); - ggml_set_input(lctx.inp_mean); - return lctx.inp_mean; + ggml_tensor * cur = lctx.build_inp_mean(ctx0, n_tokens); + cb(cur, "inp_mean", -1); + + return cur; } struct ggml_tensor * build_inp_cls() { - lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(lctx.inp_cls, "inp_cls", -1); - ggml_set_input(lctx.inp_cls); - return lctx.inp_cls; + ggml_tensor * cur = lctx.build_inp_cls(ctx0, n_tokens); + cb(cur, "inp_cls", -1); + + return cur; } struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { @@ -745,26 +646,22 @@ struct llm_build_context { //} struct ggml_tensor * build_inp_embd_enc() { - const int64_t n_embd = hparams.n_embd; - lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); - ggml_set_input(lctx.inp_embd_enc); - cb(lctx.inp_embd_enc, "embd_enc", -1); - return lctx.inp_embd_enc; + ggml_tensor * cur = lctx.build_inp_embd_enc(ctx0, n_tokens, worst_case); + cb(cur, "embd_enc", -1); + + return cur; } struct ggml_tensor * build_inp_KQ_mask_cross() { - lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - ggml_set_input(lctx.inp_KQ_mask_cross); - cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1); - return lctx.inp_KQ_mask_cross; + ggml_tensor * cur = lctx.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + cb(cur, "KQ_mask_cross", -1); + + return cur; } struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -838,7 +735,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -927,9 +823,6 @@ struct llm_build_context { struct ggml_cgraph * build_deci() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t 
n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -1014,7 +907,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -1422,9 +1314,6 @@ struct llm_build_context { struct ggml_cgraph * build_grok() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -1498,7 +1387,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -1580,9 +1468,6 @@ struct llm_build_context { struct ggml_cgraph * build_dbrx() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -1649,7 +1534,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -2716,10 +2600,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); - cb(lctx.inp_pos, "inp_pos", -1); - ggml_set_input(lctx.inp_pos); - struct ggml_tensor * inp_pos = lctx.inp_pos; + struct ggml_tensor * inp_pos = build_inp_pos(); lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); @@ -2825,9 +2706,6 @@ struct llm_build_context { struct ggml_cgraph * build_qwen2moe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2891,7 +2769,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -4685,9 +4562,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmo() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ 
-4757,7 +4631,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -4808,9 +4681,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmo2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4880,7 +4750,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -4935,9 +4804,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmoe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5006,7 +4872,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5325,9 +5190,6 @@ struct llm_build_context { struct ggml_cgraph * build_arctic() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5385,7 +5247,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5458,9 +5319,6 @@ struct llm_build_context { struct ggml_cgraph * build_deepseek() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5535,7 +5393,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5616,9 +5473,6 @@ struct llm_build_context { struct ggml_cgraph * build_deepseek2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - bool 
is_lite = (hparams.n_layer == 27); // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. @@ -5767,7 +5621,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5996,9 +5849,6 @@ struct llm_build_context { //struct ggml_cgraph * build_t5_enc() { // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // // mutable variable, needed during the last layer of the computation to skip unused tokens - // int32_t n_tokens = this->n_tokens; - // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6072,7 +5922,6 @@ struct llm_build_context { // if (il == n_layer - 1) { // // skip computing output for unused tokens // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // n_tokens = n_outputs; // cur = ggml_get_rows(ctx0, cur, inp_out_ids); // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // } @@ -6128,9 +5977,6 @@ struct llm_build_context { //struct ggml_cgraph * build_t5_dec() { // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // // mutable variable, needed during the last layer of the computation to skip unused tokens - // int32_t n_tokens = this->n_tokens; - // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6272,7 +6118,6 @@ struct llm_build_context { // if (il == n_layer - 1) { // // skip computing output for unused tokens // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // n_tokens = n_outputs; // cur = ggml_get_rows(ctx0, cur, inp_out_ids); // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); @@ -6673,9 +6518,6 @@ struct llm_build_context { struct ggml_cgraph * build_exaone() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6748,7 +6590,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6978,9 +6819,6 @@ struct llm_build_context { struct ggml_cgraph * build_chameleon() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7076,7 +6914,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -7341,8 +7178,6 @@ static struct 
ggml_cgraph * llama_build_graph( struct llm_build_context llm(lctx, ubatch, cb, worst_case); - llm.init(); - switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_MINICPM: @@ -7403,7 +7238,6 @@ static struct ggml_cgraph * llama_build_graph( } break; case LLM_ARCH_QWEN2VL: { - lctx.n_pos_per_token = 4; result = llm.build_qwen2vl(); } break; case LLM_ARCH_QWEN2MOE: @@ -7564,8 +7398,6 @@ static struct ggml_cgraph * llama_build_graph( result = llm.append_pooling(result); } - llm.free(); - return result; } @@ -7908,7 +7740,7 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct - ctx = new llama_context(*model, params, + ctx = new llama_context_unified(*model, params, [](llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { return llama_build_graph(lctx, ubatch, worst_case); }); From b52b79b048e3b82ea68c20de34ceac3fc3984786 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 11:23:38 +0200 Subject: [PATCH 30/84] context : move encode/decode to llama-context.cpp --- src/llama-context.cpp | 25 +++++++++++++++++++ src/llama-context.h | 23 +++++++++++++++++ src/llama.cpp | 57 ------------------------------------------- 3 files changed, 48 insertions(+), 57 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 13beb097cbadd..4e02f155b1a81 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3980,6 +3980,31 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa } } +/// + +int32_t llama_encode( + struct llama_context * ctx, + struct llama_batch batch) { + const int ret = ctx->encode(batch); + if (ret != 0) { + LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret); + } + + return ret; +} + +int32_t llama_decode( + struct llama_context * ctx, + struct llama_batch batch) { + const int ret = ctx->decode(batch); + if (ret != 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); + } + + return ret; +} + + const std::vector> & llama_internal_get_tensor_map( struct llama_context * ctx ) { diff --git a/src/llama-context.h b/src/llama-context.h index f7e007f3273c5..ac842dc8bc54c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -45,7 +45,30 @@ struct llama_context { virtual ggml_context_ptr init(); + // decode a batch of tokens by evaluating the transformer + // in case of unsuccessful decoding (error or warning), + // the kv_cache state will be returned to its original state + // (for non-recurrent models) or cleaned (for recurrent models) + // + // - lctx: llama context + // - inp_batch: batch to evaluate + // + // return 0 on success + // return positive int on warning + // return negative int on error + // virtual int decode(llama_batch & inp_batch) = 0; + + + // encode a batch of tokens by evaluating the encoder part of the transformer + // + // - lctx: llama context + // - batch: batch to evaluate + // + // return 0 on success + // return positive int on warning + // return negative int on error + // virtual int encode(llama_batch & inp_batch) = 0; // graph build API (generic) diff --git a/src/llama.cpp b/src/llama.cpp index 7c002f9bf8ff0..f623dd385d917 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7401,39 +7401,6 @@ static struct ggml_cgraph * llama_build_graph( return result; } -// decode a batch of tokens by evaluating the transformer -// in case of unsuccessful decoding (error or warning), -// the kv_cache state will be returned to its original state -// (for non-recurrent models) 
or cleaned (for recurrent models) -// -// - lctx: llama context -// - inp_batch: batch to evaluate -// -// return 0 on success -// return positive int on warning -// return negative int on error -// -static int llama_decode_impl( - llama_context & lctx, - llama_batch inp_batch) { - return lctx.decode(inp_batch); -} - -// encode a batch of tokens by evaluating the encoder part of the transformer -// -// - lctx: llama context -// - batch: batch to evaluate -// -// return 0 on success -// return positive int on warning -// return negative int on error -// -static int llama_encode_impl( - llama_context & lctx, - llama_batch inp_batch) { - return lctx.encode(inp_batch); -} - // // interface implementation // @@ -7759,30 +7726,6 @@ struct llama_context * llama_new_context_with_model( return llama_init_from_model(model, params); } -/// - -int32_t llama_encode( - struct llama_context * ctx, - struct llama_batch batch) { - const int ret = llama_encode_impl(*ctx, batch); - if (ret != 0) { - LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret); - } - - return ret; -} - -int32_t llama_decode( - struct llama_context * ctx, - struct llama_batch batch) { - const int ret = llama_decode_impl(*ctx, batch); - if (ret != 0) { - LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); - } - - return ret; -} - // // chat templates // From 8da7f612b750851d7e13e4f1697ed8a98c46db3c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 12:11:30 +0200 Subject: [PATCH 31/84] context : improve llama_context encapsulation ggml-ci --- src/llama-adapter.cpp | 10 +- src/llama-adapter.h | 2 +- src/llama-context.cpp | 242 ++++++++++++++++++++++++++++++------------ src/llama-context.h | 79 ++++++++++---- src/llama.cpp | 151 +++++++++++++++----------- 5 files changed, 327 insertions(+), 157 deletions(-) diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 8a0800463137e..3ce36886c0e1f 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -91,7 +91,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { return true; } -int32_t llama_adapter_cvec::apply( +bool llama_adapter_cvec::apply( const llama_model & model, const float * data, size_t len, @@ -104,17 +104,17 @@ int32_t llama_adapter_cvec::apply( // disable the current control vector (but leave allocated for later) layer_start = -1; layer_end = -1; - return 0; + return true; } if (n_embd != (int) hparams.n_embd) { LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); - return 1; + return false; } if (tensors.empty()) { if (!init(model)) { - return 1; + return false; } } @@ -130,7 +130,7 @@ int32_t llama_adapter_cvec::apply( } } - return 0; + return true; } // lora diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 603fa08f6d186..4332ccd57f14b 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -19,7 +19,7 @@ struct llama_adapter_cvec { struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const; - int32_t apply( + bool apply( const llama_model & model, const float * data, size_t len, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4e02f155b1a81..353fc7feac66c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,7 +33,9 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +// // llama_context +// llama_context::llama_context(const llama_model & model) : model (model), @@ -43,6 +45,52 @@ llama_context::llama_context(const llama_model & 
model) : llama_context::~llama_context() = default; +const llama_model & llama_context::get_model() const { + return model; +} + +const llama_cparams & llama_context::get_cparams() const { + return cparams; +} + +uint32_t llama_context::n_ctx() const { + return cparams.n_ctx; +} + +uint32_t llama_context::n_batch() const { + return cparams.n_batch; +} + +uint32_t llama_context::n_ubatch() const { + return cparams.n_ubatch; +} + +uint32_t llama_context::n_threads() const { + return cparams.n_threads; +} + +uint32_t llama_context::n_threads_batch() const { + return cparams.n_threads_batch; +} + +enum llama_pooling_type llama_context::pooling_type() const { + return cparams.pooling_type; +} + +int64_t llama_context::n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; +} + +ggml_context_ptr llama_context::init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + return ggml_context_ptr { ggml_init(params) }; +} + void llama_context::synchronize() { ggml_backend_sched_synchronize(sched.get()); @@ -73,21 +121,96 @@ void llama_context::synchronize() { t_compute_start_us = 0; } -int64_t llama_context::n_pos_per_token() const { - return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; +void llama_context::attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + this->threadpool = threadpool; + this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool; } -ggml_context_ptr llama_context::init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; +void llama_context::detach_threadpool() { + this->threadpool = nullptr; + this->threadpool_batch = nullptr; +} - return ggml_context_ptr { ggml_init(params) }; +void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { + cparams.n_threads = n_threads; + cparams.n_threads_batch = n_threads_batch; +} + +void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { + this->abort_callback = abort_callback; + this->abort_callback_data = abort_callback_data; + + for (auto & backend : backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data); + } + } +} + +void llama_context::set_embeddings(bool value) { + cparams.embeddings = value; +} + +void llama_context::set_causal_attn(bool value) { + cparams.causal_attn = value; +} + +void llama_context::set_adapter_lora( + struct llama_adapter_lora * adapter, + float scale) { + loras[adapter] = scale; +} + +bool llama_context::rm_adapter_lora( + struct llama_adapter_lora * adapter) { + auto pos = loras.find(adapter); + if (pos != loras.end()) { + loras.erase(pos); + return true; + } + + return false; +} + +void llama_context::clear_adapter_lora() { + loras.clear(); +} + +bool llama_context::apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return cvec.apply(model, data, len, n_embd, il_start, il_end); +} + +llama_perf_context_data llama_context::get_perf() const { + llama_perf_context_data data = {}; + + data.t_start_ms = 
1e-3 * t_start_us; + data.t_load_ms = 1e-3 * t_load_us; + data.t_p_eval_ms = 1e-3 * t_p_eval_us; + data.t_eval_ms = 1e-3 * t_eval_us; + data.n_p_eval = std::max(1, n_p_eval); + data.n_eval = std::max(1, n_eval); + + return data; } +void llama_context::perf_reset() { + t_start_us = ggml_time_us(); + t_eval_us = n_eval = 0; + t_p_eval_us = n_p_eval = 0; +} + +// // llama_context_unified +// llama_context_unified::llama_context_unified( const llama_model & model, @@ -396,18 +519,6 @@ llama_context_unified::llama_context_unified( llama_context_unified::~llama_context_unified() = default; -uint32_t llama_context_unified::n_ctx() const { - return cparams.n_ctx; -} - -uint32_t llama_context_unified::n_batch() const { - return cparams.n_batch; -} - -uint32_t llama_context_unified::n_ubatch() const { - return cparams.n_ubatch; -} - uint32_t llama_context_unified::n_seq_max() const { // TODO: add notion of n_seq_max to llama_kv_cache and use it here return kv_self.size; @@ -421,10 +532,6 @@ const llama_kv_cache * llama_context_unified::get_kv_self() const { return &kv_self; } -enum llama_pooling_type llama_context_unified::pooling_type() const { - return cparams.pooling_type; -} - float * llama_context_unified::get_logits() { // reorder logits for backward compatibility reorder_outputs(); @@ -1718,7 +1825,13 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) { return n_outputs_max; } -// do mat_mul, while optionally apply lora +ggml_tensor * llama_context::build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il) { + return cvec.apply_to(ctx0, cur, il); +} + ggml_tensor * llama_context::build_lora_mm( ggml_context * ctx0, ggml_tensor * w, @@ -1746,7 +1859,6 @@ ggml_tensor * llama_context::build_lora_mm( return res; } -// do mat_mul_id, while optionally apply lora ggml_tensor * llama_context::build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, @@ -2994,7 +3106,8 @@ struct llama_data_write { } void write_model_info() { - const std::string arch_str = llm_arch_name(ctx->model.arch); + const auto & model = ctx->get_model(); + const std::string arch_str = llm_arch_name(model.arch); write_string(arch_str); // TODO: add more model-specific info which should prevent loading the session file if not identical } @@ -3015,7 +3128,7 @@ struct llama_data_write { std::vector output_pos; - const size_t n_batch = ctx->cparams.n_batch; + const size_t n_batch = ctx->n_batch(); const auto & output_ids = ctx->output_ids; GGML_ASSERT(n_outputs <= ctx->output_size); @@ -3040,7 +3153,9 @@ struct llama_data_write { } void write_logits() { - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); + const auto & model = ctx->get_model(); + + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * model.vocab.n_tokens()); write(&logits_size, sizeof(logits_size)); @@ -3050,7 +3165,9 @@ struct llama_data_write { } void write_embeddings() { - const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); + const auto & model = ctx->get_model(); + + const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * model.hparams.n_embd); write(&embeddings_size, sizeof(embeddings_size)); @@ -3079,7 +3196,9 @@ struct llama_data_read { // validate model information void read_model_info() { - const std::string cur_arch_str = llm_arch_name(ctx->model.arch); + const auto & model = ctx->get_model(); + + const 
std::string cur_arch_str = llm_arch_name(model.arch); std::string arch_str; read_string(arch_str); @@ -3117,8 +3236,8 @@ struct llama_data_read { for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { int32_t id = output_pos[i]; - if ((uint32_t) id >= ctx->cparams.n_batch) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch)); + if ((uint32_t) id >= ctx->n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->n_batch())); } ctx->output_ids[id] = i; } @@ -3598,7 +3717,7 @@ uint32_t llama_n_seq_max(const struct llama_context * ctx) { } const llama_model * llama_get_model(const llama_context * ctx) { - return &ctx->model; + return &ctx->get_model(); } llama_kv_cache * llama_get_kv_self(llama_context * ctx) { @@ -3614,50 +3733,38 @@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { } void llama_attach_threadpool( - struct llama_context * ctx, - ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch) { - ctx->threadpool = threadpool; - ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool; + struct llama_context * ctx, + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + ctx->attach_threadpool(threadpool, threadpool_batch); } void llama_detach_threadpool(struct llama_context * ctx) { - ctx->threadpool = nullptr; - ctx->threadpool_batch = nullptr; + ctx->detach_threadpool(); } void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { - ctx->cparams.n_threads = n_threads; - ctx->cparams.n_threads_batch = n_threads_batch; + ctx->set_n_threads(n_threads, n_threads_batch); } int32_t llama_n_threads(struct llama_context * ctx) { - return ctx->cparams.n_threads; + return ctx->n_threads(); } int32_t llama_n_threads_batch(struct llama_context * ctx) { - return ctx->cparams.n_threads_batch; + return ctx->n_threads_batch(); } void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = abort_callback_data; - - for (auto & backend : ctx->backends) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); - auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); - if (set_abort_callback_fn) { - set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); - } - } + ctx->set_abort_callback(abort_callback, abort_callback_data); } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { - ctx->cparams.embeddings = embeddings; + ctx->set_embeddings(embeddings); } void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { - ctx->cparams.causal_attn = causal_attn; + ctx->set_causal_attn(causal_attn); } void llama_synchronize(struct llama_context * ctx) { @@ -3700,24 +3807,21 @@ int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, float scale) { - ctx->loras[adapter] = scale; + ctx->set_adapter_lora(adapter, scale); + return 0; } int32_t llama_rm_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; - } + bool res = ctx->rm_adapter_lora(adapter); - return -1; + return res ? 
0 : -1; } void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); + ctx->clear_adapter_lora(); } int32_t llama_apply_adapter_cvec( @@ -3727,7 +3831,9 @@ int32_t llama_apply_adapter_cvec( int32_t n_embd, int32_t il_start, int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); + bool res = ctx->apply_adapter_cvec(data, len, n_embd, il_start, il_end); + + return res ? 0 : -1; } // @@ -4008,5 +4114,5 @@ int32_t llama_decode( const std::vector> & llama_internal_get_tensor_map( struct llama_context * ctx ) { - return ctx->model.tensors_by_name; + return ctx->get_model().tensors_by_name; } diff --git a/src/llama-context.h b/src/llama-context.h index ac842dc8bc54c..7b7699952a6e2 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -20,19 +20,23 @@ struct llama_context { llama_context(const llama_model & model); virtual ~llama_context(); - virtual void synchronize(); + const llama_model & get_model() const; + const llama_cparams & get_cparams() const; - virtual uint32_t n_ctx() const = 0; - virtual uint32_t n_batch() const = 0; - virtual uint32_t n_ubatch() const = 0; + virtual uint32_t n_ctx() const; + virtual uint32_t n_batch() const; + virtual uint32_t n_ubatch() const; virtual uint32_t n_seq_max() const = 0; + virtual uint32_t n_threads() const; + virtual uint32_t n_threads_batch() const; + virtual llama_kv_cache * get_kv_self() = 0; virtual const llama_kv_cache * get_kv_self() const = 0; virtual void kv_self_update() = 0; - virtual enum llama_pooling_type pooling_type() const = 0; + virtual enum llama_pooling_type pooling_type() const; virtual float * get_logits() = 0; virtual float * get_logits_ith(int32_t i) = 0; @@ -41,10 +45,41 @@ struct llama_context { virtual float * get_embeddings_ith(int32_t i) = 0; virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; - int64_t n_pos_per_token() const; // vision + virtual int64_t n_pos_per_token() const; // vision virtual ggml_context_ptr init(); + virtual void synchronize(); + + virtual void attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch); + + virtual void detach_threadpool(); + + virtual void set_n_threads(int32_t n_threads, int32_t n_threads_batch); + + virtual void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); + + virtual void set_embeddings (bool value); + virtual void set_causal_attn(bool value); + + virtual void set_adapter_lora( + struct llama_adapter_lora * adapter, + float scale); + + virtual bool rm_adapter_lora( + struct llama_adapter_lora * adapter); + + virtual void clear_adapter_lora(); + + virtual bool apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); + // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -73,6 +108,12 @@ struct llama_context { // graph build API (generic) + // apply control vector for layer il + virtual ggml_tensor * build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il); + // do mat_mul, while optionally apply lora virtual ggml_tensor * build_lora_mm( ggml_context * ctx0, @@ -221,11 +262,11 @@ struct llama_context { // state save/load - virtual size_t state_get_size() = 0; + virtual size_t state_get_size() = 0; virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; - virtual size_t 
state_seq_get_size(llama_seq_id seq_id) = 0; + virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; @@ -253,8 +294,19 @@ struct llama_context { const llama_token * tokens, size_t n_token_count) = 0; + // perf + + virtual llama_perf_context_data get_perf() const; + virtual void perf_reset(); + // members + // TODO: temporary public until llama_context implements the graph build function + std::vector backends; + ggml_backend_t backend_cpu = nullptr; + ggml_backend_sched_ptr sched; + +protected: const llama_model & model; llama_cparams cparams; @@ -267,17 +319,11 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; - std::vector backends; std::vector> set_n_threads_fns; - ggml_backend_t backend_cpu = nullptr; - - ggml_backend_sched_ptr sched; - // memory buffers used to evaluate the model std::vector buf_compute_meta; - // perf bool has_evaluated_once = false; mutable int64_t t_start_us; @@ -306,9 +352,6 @@ struct llama_context_unified : public llama_context { virtual ~llama_context_unified(); - virtual uint32_t n_ctx() const override; - virtual uint32_t n_batch() const override; - virtual uint32_t n_ubatch() const override; virtual uint32_t n_seq_max() const override; virtual llama_kv_cache * get_kv_self() override; @@ -316,8 +359,6 @@ struct llama_context_unified : public llama_context { virtual void kv_self_update() override; - virtual enum llama_pooling_type pooling_type() const override; - virtual float * get_logits() override; virtual float * get_logits_ith(int32_t i) override; diff --git a/src/llama.cpp b/src/llama.cpp index f623dd385d917..ab6b7f5d3dae4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -59,8 +59,6 @@ struct llm_build_context { const llama_hparams & hparams; const llama_cparams & cparams; const llama_ubatch & ubatch; - const llama_adapter_cvec & cvec; - const llama_loras & loras; const int64_t n_embd; const int64_t n_layer; @@ -105,12 +103,10 @@ struct llm_build_context { const llm_build_cb & cb, bool worst_case) : lctx (lctx), - model (lctx.model), + model (lctx.get_model()), hparams (model.hparams), - cparams (lctx.cparams), + cparams (lctx.get_cparams()), ubatch (ubatch), - cvec (lctx.cvec), - loras (lctx.loras), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -791,7 +787,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -947,7 +943,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1067,7 +1063,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1171,7 +1168,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1287,7 +1285,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = 
lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1436,7 +1435,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1564,7 +1563,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1670,7 +1669,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1761,7 +1761,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2057,7 +2058,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2194,7 +2196,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2342,7 +2345,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2454,7 +2458,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2565,7 +2570,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2680,7 +2686,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2823,7 +2830,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2944,7 +2952,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3083,7 +3092,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3190,7 +3200,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3296,7 +3307,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3406,7 +3418,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur 
= lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3521,7 +3534,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3638,7 +3652,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3842,7 +3857,8 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3954,7 +3970,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4077,7 +4094,8 @@ struct llm_build_context { cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, sa_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4202,7 +4220,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4256,7 +4275,8 @@ struct llm_build_context { // residual cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4397,7 +4417,8 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4527,7 +4548,8 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4655,7 +4677,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4774,7 +4796,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4899,7 +4921,8 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5024,7 +5047,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5137,7 +5161,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5165,7 +5190,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = 
cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5293,7 +5319,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5446,7 +5472,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5673,7 +5700,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6492,7 +6520,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6614,7 +6642,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6704,7 +6732,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, 0.5F); } - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6787,7 +6815,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6947,7 +6976,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7140,7 +7169,8 @@ static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { - const auto & model = lctx.model; + const auto & model = lctx.get_model(); + const auto & cparams = lctx.get_cparams(); // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { @@ -7150,7 +7180,7 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_name(cur, name); } - if (!lctx.cparams.offload_kqv) { + if (!cparams.offload_kqv) { if (strcmp(name, "kqv_merged_cont") == 0) { // all nodes between the KV store and the attention output are run on the CPU ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu); @@ -7159,10 +7189,10 @@ static struct ggml_cgraph * llama_build_graph( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer; + const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { - const auto & dev_layer = lctx.model.dev_layer(il); + const auto & dev_layer = model.dev_layer(il); for (auto & backend : lctx.backends) { if (ggml_backend_get_device(backend.get()) == dev_layer) { if (ggml_backend_supports_op(backend.get(), cur)) { @@ -7394,7 +7424,7 @@ static struct ggml_cgraph * llama_build_graph( } // add on pooling layer - if (lctx.cparams.embeddings) { + if (cparams.embeddings) { result = llm.append_pooling(result); } @@ -7824,12 +7854,7 @@ struct llama_perf_context_data llama_perf_context(const struct llama_context * c return data; } - data.t_start_ms = 1e-3 * ctx->t_start_us; - data.t_load_ms = 1e-3 * ctx->t_load_us; - data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; - data.t_eval_ms = 1e-3 * ctx->t_eval_us; - data.n_p_eval = std::max(1, ctx->n_p_eval); - data.n_eval = std::max(1, ctx->n_eval); + data = ctx->get_perf(); return data; } @@ -7848,7 +7873,5 @@ void llama_perf_context_print(const struct llama_context * ctx) { } void llama_perf_context_reset(struct llama_context * ctx) { - ctx->t_start_us = ggml_time_us(); - ctx->t_eval_us = ctx->n_eval = 0; - ctx->t_p_eval_us = ctx->n_p_eval = 0; + ctx->perf_reset(); } From d146a14f77eb456d2082f0620e3b310b7bcee0a8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 12:41:36 +0200 Subject: [PATCH 32/84] context : minor naming fix --- src/llama-context.cpp | 2 +- src/llama-context.h | 12 ++++++------ src/llama.cpp | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 353fc7feac66c..f0d8bdaba073a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -189,7 +189,7 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -llama_perf_context_data llama_context::get_perf() const { +llama_perf_context_data llama_context::perf_get_data() const { llama_perf_context_data data = {}; data.t_start_ms = 1e-3 * t_start_us; diff --git a/src/llama-context.h b/src/llama-context.h index 7b7699952a6e2..8ec7d3e2b1f69 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -94,7 +94,6 @@ struct llama_context { // virtual int decode(llama_batch & inp_batch) = 0; - // encode a batch of tokens by evaluating the encoder part of the transformer // // - lctx: llama context @@ -296,7 +295,7 @@ struct llama_context { // perf - virtual llama_perf_context_data get_perf() const; + virtual llama_perf_context_data perf_get_data() const; virtual void perf_reset(); // members @@ -326,20 +325,21 @@ struct llama_context { bool has_evaluated_once = false; - mutable int64_t 
t_start_us; - mutable int64_t t_load_us; + mutable int64_t t_start_us = 0; + mutable int64_t t_load_us = 0; mutable int64_t t_p_eval_us = 0; mutable int64_t t_eval_us = 0; mutable int64_t t_compute_start_us = 0; - mutable int64_t n_queued_tokens = 0; + mutable int64_t n_queued_tokens = 0; mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) mutable int32_t n_eval = 0; // number of eval calls }; // TODO: make implementation details private -struct llama_context_unified : public llama_context { +class llama_context_unified : public llama_context { +public: struct batch_manager; // TODO: tmp until llama-model starts implementing the graph build function diff --git a/src/llama.cpp b/src/llama.cpp index ab6b7f5d3dae4..c568f8d15c63c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -54,11 +54,11 @@ enum llm_norm_type { }; struct llm_build_context { - llama_context & lctx; - const llama_model & model; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; + llama_context & lctx; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; const int64_t n_embd; const int64_t n_layer; @@ -7854,7 +7854,7 @@ struct llama_perf_context_data llama_perf_context(const struct llama_context * c return data; } - data = ctx->get_perf(); + data = ctx->perf_get_data(); return data; } From 5eae8e5183f80a8b669757bde7b26cec05923081 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 13:32:02 +0200 Subject: [PATCH 33/84] context : move build_rope_factors to base class ggml-ci --- src/llama-context.cpp | 172 +++++++++++++++++++++--------------------- src/llama-context.h | 19 +++-- src/llama.cpp | 14 ++-- 3 files changed, 104 insertions(+), 101 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f0d8bdaba073a..b29c98af63add 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -57,6 +57,10 @@ uint32_t llama_context::n_ctx() const { return cparams.n_ctx; } +uint32_t llama_context::n_ctx_per_seq() const { + return cparams.n_ctx / cparams.n_seq_max; +} + uint32_t llama_context::n_batch() const { return cparams.n_batch; } @@ -122,8 +126,8 @@ void llama_context::synchronize() { } void llama_context::attach_threadpool( - ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch) { + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { this->threadpool = threadpool; this->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; } @@ -202,6 +206,86 @@ llama_perf_context_data llama_context::perf_get_data() const { return data; } +ggml_tensor * llama_context::build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il) { + return cvec.apply_to(ctx0, cur, il); +} + +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +ggml_tensor * llama_context::build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur, + ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +ggml_tensor * llama_context::build_rope_factors(int il) { + const auto & hparams = model.hparams; + + // choose long/short freq factors based on the context size + const auto n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + + if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) { + return model.layers[il].rope_long; + } + + return model.layers[il].rope_short; +} + void llama_context::perf_reset() { t_start_us = ggml_time_us(); t_eval_us = n_eval = 0; @@ -217,7 +301,7 @@ llama_context_unified::llama_context_unified( const llama_context_params & params, build_graph_callback && cb_build_graph) : llama_context(model), - cb_build_graph(std::move(cb_build_graph)){ + cb_build_graph(std::move(cb_build_graph)) { const auto & hparams = model.hparams; @@ -1825,69 +1909,6 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) { return n_outputs_max; } -ggml_tensor * llama_context::build_cvec( - ggml_context * ctx0, - ggml_tensor * cur, - int il) { - return cvec.apply_to(ctx0, cur, il); -} - -ggml_tensor * llama_context::build_lora_mm( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - - return res; -} - -ggml_tensor * llama_context::build_lora_mm_id( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur, - ggml_tensor * ids) { - struct ggml_tensor * res = 
ggml_mul_mat_id(ctx0, w, cur, ids); - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } - - const float alpha = lora.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? lora.second * alpha / rank : lora.second; - - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - - return res; -} - void llama_context_unified::kv_self_update() { auto & kv = kv_self; @@ -2189,23 +2210,6 @@ ggml_tensor * llama_context_unified::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -ggml_tensor * llama_context_unified::get_rope_factors(int il) { - const auto & hparams = model.hparams; - - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - - if (model.layers[il].rope_freqs != nullptr) { - return model.layers[il].rope_freqs; - } - - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } - - return model.layers[il].rope_short; -} - ggml_tensor * llama_context_unified::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, @@ -2327,7 +2331,7 @@ void llama_context_unified::build_k_shift( const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - struct ggml_tensor * rope_factors = get_rope_factors(il); + struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], diff --git a/src/llama-context.h b/src/llama-context.h index 8ec7d3e2b1f69..dd1030388e692 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -23,10 +23,11 @@ struct llama_context { const llama_model & get_model() const; const llama_cparams & get_cparams() const; - virtual uint32_t n_ctx() const; - virtual uint32_t n_batch() const; - virtual uint32_t n_ubatch() const; - virtual uint32_t n_seq_max() const = 0; + virtual uint32_t n_ctx() const; + virtual uint32_t n_ctx_per_seq() const; + virtual uint32_t n_batch() const; + virtual uint32_t n_ubatch() const; + virtual uint32_t n_seq_max() const = 0; virtual uint32_t n_threads() const; virtual uint32_t n_threads_batch() const; @@ -126,6 +127,8 @@ struct llama_context { ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * ids); + virtual ggml_tensor * build_rope_factors(int il); + // graph build API (context-specific) virtual ggml_tensor * build_inp_embd( @@ -182,8 +185,6 @@ struct llama_context { ggml_tensor * kq, float kq_scale) = 0; - virtual ggml_tensor * get_rope_factors(int il) = 0; - virtual void build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) = 0; @@ -342,7 +343,7 @@ class llama_context_unified : public llama_context { public: struct batch_manager; - // TODO: tmp until llama-model starts implementing the graph build function + // TODO: tmp until llama_model starts implementing the graph build function typedef std::function build_graph_callback; llama_context_unified( @@ -496,8 +497,6 @@ class llama_context_unified : public llama_context { ggml_tensor * kq, float kq_scale) override; - virtual ggml_tensor * get_rope_factors(int il) override; - virtual void build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) override; @@ -601,7 +600,7 @@ class llama_context_unified : public llama_context { virtual size_t state_get_data( uint8_t 
* dst, size_t size) override; virtual size_t state_set_data(const uint8_t * src, size_t size) override; - virtual size_t state_seq_get_size(llama_seq_id seq_id) override; + virtual size_t state_seq_get_size(llama_seq_id seq_id) override; virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; diff --git a/src/llama.cpp b/src/llama.cpp index c568f8d15c63c..9e37b0cd46dba 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -685,7 +685,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -857,7 +857,7 @@ struct llm_build_context { } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -2999,7 +2999,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, @@ -3706,7 +3706,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -4480,7 +4480,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -5373,7 +5373,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -6572,7 +6572,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); From e633dc171a8ae3d44c647bbd94a1921ed74c181c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 13:48:52 +0200 Subject: [PATCH 34/84] context : introduce llama_graph_i ggml-ci --- src/CMakeLists.txt | 1 + src/llama-context.h | 134 +----------------------------------- src/llama-graph.cpp | 1 + src/llama-graph.h | 164 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 132 deletions(-) create mode 100644 
src/llama-graph.cpp create mode 100644 src/llama-graph.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e1b02e4c08f07..f1f5d41d495a1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,7 @@ add_library(llama llama-chat.cpp llama-context.cpp llama-grammar.cpp + llama-graph.cpp llama-hparams.cpp llama-impl.cpp llama-kv-cache.cpp diff --git a/src/llama-context.h b/src/llama-context.h index dd1030388e692..b446118ff2ffd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -3,6 +3,7 @@ #include "llama.h" #include "llama-batch.h" #include "llama-cparams.h" +#include "llama-graph.h" #include "llama-model.h" #include "llama-kv-cache.h" #include "llama-adapter.h" @@ -16,7 +17,7 @@ using llama_loras = std::unordered_map; -struct llama_context { +struct llama_context : public llama_graph_i { llama_context(const llama_model & model); virtual ~llama_context(); @@ -129,137 +130,6 @@ struct llama_context { virtual ggml_tensor * build_rope_factors(int il); - // graph build API (context-specific) - - virtual ggml_tensor * build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) = 0; - - virtual ggml_tensor * build_inp_pos( - ggml_context * ctx0, - int32_t n_tokens) = 0; - - virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) = 0; - - virtual ggml_tensor * build_inp_mean( - ggml_context * ctx0, - int32_t n_tokens) = 0; - - virtual ggml_tensor * build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) = 0; - - virtual void build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa, - bool worst_case) = 0; - - virtual void build_attn_kv_store( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - int32_t n_tokens, - int64_t il, - bool worst_case) = 0; - - virtual ggml_tensor * build_attn_qkv( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - int32_t n_tokens, - float kq_scale, - int il, - bool worst_case) = 0; - - virtual ggml_tensor * build_soft_max_ext( - ggml_context * ctx0, - ggml_tensor * kq, - float kq_scale) = 0; - - virtual void build_k_shift( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; - - virtual ggml_tensor * build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) = 0; - - virtual ggml_tensor * build_inp_KQ_mask_cross( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) = 0; - - virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0, - bool worst_case) = 0; - - virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0, - bool worst_case) = 0; - - virtual ggml_tensor * build_copy_mask_state( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * s, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - int32_t n_tokens, - int32_t n_state, - int32_t n_seqs, - bool worst_case) = 0; - - virtual ggml_tensor * build_mamba_layer( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * cur, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il, - bool worst_case) = 0; - - virtual ggml_tensor * build_rwkv_token_shift_load( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int 
il, - bool worst_case) = 0; - - virtual ggml_tensor * build_rwkv_token_shift_store( - ggml_context * ctx0, - ggml_tensor * token_shift, - const llama_ubatch & ubatch, - int il, - bool worst_case) = 0; - - virtual ggml_tensor * build_rwkv6_time_mix( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * cur, - ggml_tensor * x_prev, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il, - bool worst_case) = 0; - // state save/load virtual size_t state_get_size() = 0; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp new file mode 100644 index 0000000000000..20f2ee0bd56aa --- /dev/null +++ b/src/llama-graph.cpp @@ -0,0 +1 @@ +#include "llama-graph.h" diff --git a/src/llama-graph.h b/src/llama-graph.h new file mode 100644 index 0000000000000..37dff8db40541 --- /dev/null +++ b/src/llama-graph.h @@ -0,0 +1,164 @@ +#pragma once + +#include + +struct ggml_cgraph; +struct ggml_context; +struct ggml_tensor; +struct llama_ubatch; + +// TODO: pass to llama_model graph build +class llama_graph_i { +public: + // apply control vector for layer il + virtual ggml_tensor * build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il) = 0; + + // do mat_mul, while optionally apply lora + virtual ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) = 0; + + // do mat_mul_id, while optionally apply lora + virtual ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids) = 0; + + virtual ggml_tensor * build_rope_factors(int il) = 0; + + // graph build API (context-specific) + + virtual ggml_tensor * build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) = 0; + + virtual ggml_tensor * build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual ggml_tensor * build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual ggml_tensor * build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) = 0; + + virtual void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) = 0; + + virtual void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) = 0; + + virtual ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) = 0; + + virtual void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) = 0; + + virtual ggml_tensor * build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_KQ_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) = 0; + + virtual ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) = 0; + + virtual ggml_tensor * 
build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) = 0; + + virtual ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; + + virtual ggml_tensor * build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) = 0; +}; From 0ab50f1bbb4770ac7575f261fa53df6ae0d68767 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 13:59:43 +0200 Subject: [PATCH 35/84] context : prepare llama_model graph build ggml-ci --- src/llama.cpp | 269 +++++++++++++++++++++++++------------------------- 1 file changed, 136 insertions(+), 133 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 9e37b0cd46dba..e71a87ee9fcdf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -54,7 +54,7 @@ enum llm_norm_type { }; struct llm_build_context { - llama_context & lctx; + llama_graph_i & lgf; const llama_model & model; const llama_hparams & hparams; const llama_cparams & cparams; @@ -98,14 +98,17 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, + llama_graph_i & lgf, + const llama_model & model, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + llm_build_cb && cb, + ggml_context_ptr && ctx, bool worst_case) : - lctx (lctx), - model (lctx.get_model()), + lgf (lgf), + model (model), hparams (model.hparams), - cparams (lctx.get_cparams()), + cparams (cparams), ubatch (ubatch), n_embd (hparams.n_embd), n_layer (hparams.n_layer), @@ -133,14 +136,14 @@ struct llm_build_context { flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - cb (cb), - ctx (lctx.init()), - ctx0 (ctx.get()) { + cb (std::move(cb)), + ctx (std::move(ctx)), + ctx0 (this->ctx.get()) { } // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lctx.build_inp_embd(ctx0, tok_embd, ubatch); + struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch); cb(inpL, "inp_embd", -1); return inpL; @@ -150,7 +153,7 @@ struct llm_build_context { struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, struct ggml_tensor * cur) { - return lctx.build_lora_mm(ctx0, w, cur); + return lgf.build_lora_mm(ctx0, w, cur); } // TODO: tmp @@ -158,7 +161,7 @@ struct llm_build_context { struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b struct ggml_tensor * ids) { - return lctx.build_lora_mm_id(ctx0, w, cur, ids); + return lgf.build_lora_mm_id(ctx0, w, cur, ids); } struct ggml_tensor * build_norm( @@ -460,12 +463,12 @@ struct llm_build_context { 
ggml_build_forward_expand(graph, v_cur); //build_kv_store(graph, k_cur, v_cur, il); - lctx.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); struct ggml_tensor * cur; //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lctx.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -503,7 +506,7 @@ struct llm_build_context { struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - lctx.build_k_shift(ctx0, gf); + lgf.build_k_shift(ctx0, gf); return gf; } @@ -511,34 +514,34 @@ struct llm_build_context { struct ggml_cgraph * build_defrag() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - lctx.build_defrag(ctx0, gf); + lgf.build_defrag(ctx0, gf); return gf; } struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lctx.build_inp_pos(ctx0, n_tokens); + ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); cb(cur, "inp_pos", -1); return cur; } struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lctx.build_inp_out_ids(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case); cb(cur, "inp_out_ids", -1); return cur; } struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lctx.build_inp_mean(ctx0, n_tokens); + ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens); cb(cur, "inp_mean", -1); return cur; } struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lctx.build_inp_cls(ctx0, n_tokens); + ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens); cb(cur, "inp_cls", -1); return cur; @@ -642,14 +645,14 @@ struct llm_build_context { //} struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lctx.build_inp_embd_enc(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case); cb(cur, "embd_enc", -1); return cur; } struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lctx.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); cb(cur, "KQ_mask_cross", -1); return cur; @@ -670,7 +673,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -685,7 +688,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -787,7 +790,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -831,7 +834,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -857,7 +860,7 @@ struct llm_build_context { } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -943,7 +946,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -987,7 +990,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1064,7 +1067,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1102,7 +1105,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1169,7 +1172,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1206,7 +1209,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -1286,7 +1289,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1328,7 +1331,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1435,7 +1438,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1480,7 +1483,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -1563,7 +1566,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1602,7 +1605,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -1670,7 +1673,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1702,7 +1705,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ 
-1762,7 +1765,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1816,7 +1819,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - lctx.build_attn_inp(ctx0, n_tokens, false, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -1887,7 +1890,7 @@ struct llm_build_context { cb(kq, "kq", il); //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lctx.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); + kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -1991,7 +1994,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); inpL = build_norm(inpL, model.tok_norm, @@ -2059,7 +2062,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2093,7 +2096,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -2197,7 +2200,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2234,7 +2237,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -2346,7 +2349,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2384,7 +2387,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -2459,7 +2462,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2497,7 +2500,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -2571,7 +2574,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ 
-2608,7 +2611,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -2687,7 +2690,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2725,7 +2728,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -2831,7 +2834,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2871,7 +2874,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -2953,7 +2956,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2991,7 +2994,7 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -2999,7 +3002,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, @@ -3093,7 +3096,7 @@ struct llm_build_context { cur = ggml_add(ctx0, residual, cur); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3135,7 +3138,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -3201,7 +3204,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3240,7 +3243,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -3308,7 +3311,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - 
cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3345,7 +3348,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -3419,7 +3422,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3455,7 +3458,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -3535,7 +3538,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3573,7 +3576,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -3653,7 +3656,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3701,12 +3704,12 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -3858,7 +3861,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3902,7 +3905,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -3971,7 +3974,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4010,7 +4013,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -4095,7 +4098,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4140,7 +4143,7 @@ struct 
llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4221,7 +4224,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4253,8 +4256,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -4264,7 +4267,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); - cur = lctx.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -4276,7 +4279,7 @@ struct llm_build_context { // residual cur = ggml_add(ctx0, cur, inpL); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4314,7 +4317,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -4418,7 +4421,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4462,7 +4465,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -4480,7 +4483,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -4549,7 +4552,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4596,7 +4599,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4677,7 +4680,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = 
lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4715,7 +4718,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4796,7 +4799,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4838,7 +4841,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4922,7 +4925,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4958,7 +4961,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -5048,7 +5051,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5085,7 +5088,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -5162,7 +5165,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5191,7 +5194,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5228,7 +5231,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5319,7 +5322,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5357,7 +5360,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -5373,7 +5376,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -5473,7 +5476,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5521,7 +5524,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5701,7 +5704,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5738,7 +5741,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6215,7 +6218,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -6309,7 +6312,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6438,7 +6441,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6520,7 +6523,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6558,7 +6561,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6572,7 +6575,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lctx.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -6642,7 +6645,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = 
lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6676,8 +6679,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -6686,7 +6689,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; - struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( ctx0, gf, state_copy, state_mask, ubatch, il, worst_case ); @@ -6703,7 +6706,7 @@ struct llm_build_context { 1 ); - cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -6726,13 +6729,13 @@ struct llm_build_context { ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 ); - ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { cur = ggml_scale(ctx0, cur, 0.5F); } - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6766,8 +6769,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -6778,7 +6781,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; - struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( ctx0, gf, state_copy, state_mask, ubatch, il, worst_case ); @@ -6792,10 +6795,10 @@ struct llm_build_context { 1 ); - cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); - ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); 
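build_rwkv_token_shift_load/store and build_rwkv6_time_mix above move the per-layer recurrent state through the graph interface; the underlying idea is RWKV's token shift, where each layer blends the current token's activations with the previous token's. A minimal per-channel sketch of the channel-mix formula that appears later in this patch (sx = x_prev - x, the xk/xr lerps, a sigmoid receptance gate and a squared-ReLU key), with scalar weights standing in for the real projection matrices:

    #include <cmath>
    #include <cstdio>

    static float sigmoidf(float x) { return 1.0f / (1.0f + std::exp(-x)); }
    static float reluf(float x)    { return x > 0.0f ? x : 0.0f; }

    // one channel of RWKV channel-mix; w_r/w_k/w_v stand in for the
    // receptance/key/value projections done with build_lora_mm in the real graph
    static float rwkv_channel_mix(float x, float x_prev, float mu_k, float mu_r,
                                  float w_r, float w_k, float w_v) {
        const float sx = x_prev - x;        // token shift: difference to the previous token
        const float xk = x + sx * mu_k;     // lerp towards the previous token for the key path
        const float xr = x + sx * mu_r;     // ... and for the receptance path

        const float r = sigmoidf(w_r * xr); // receptance gate
        const float k = reluf(w_k * xk);    // squared-ReLU key
        return r * (w_v * (k * k));
    }

    int main() {
        const float y = rwkv_channel_mix(0.5f, 1.0f, 0.3f, 0.7f, 1.0f, 2.0f, 0.5f);
        std::printf("%f\n", y);
    }
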
cb(ffn_inp, "ffn_inp", il); @@ -6816,7 +6819,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6860,7 +6863,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6976,7 +6979,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.build_cvec(ctx0, cur, il); + cur = lgf.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7206,7 +7209,7 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_cgraph * result = NULL; - struct llm_build_context llm(lctx, ubatch, cb, worst_case); + struct llm_build_context llm(lctx, lctx.get_model(), lctx.get_cparams(), ubatch, std::move(cb), lctx.init(), worst_case); switch (model.arch) { case LLM_ARCH_LLAMA: From f63aeecce681afacd5acfab8401fb298c16e31de Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 15:08:40 +0200 Subject: [PATCH 36/84] llama : models now build their graphs using llama_graph_i ggml-ci --- src/llama-context.cpp | 59 +- src/llama-context.h | 26 +- src/llama-graph.h | 8 +- src/llama-model.cpp | 7374 ++++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 13 + src/llama.cpp | 7418 +---------------------------------------- 6 files changed, 7457 insertions(+), 7441 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b29c98af63add..74d6a67bbe9d2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -193,6 +193,47 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } +void llama_context::build_cb( + ggml_tensor * cur, + const char * name, + int il) { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } + + if (!cparams.offload_kqv) { + if (strcmp(name, "kqv_merged_cont") == 0) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends + // FIXME: fix in ggml_backend_sched + const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; + // TODO: during #11213, the requirement for ubatch.n_tokens < 32 was removed to simplify + // not sure if this is still needed, but it can be brought back if needed + //if (ubatch.n_tokens < 32 || full_offload) { + if (full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + const auto & dev_layer = model.dev_layer(il); + for (auto & backend : backends) { + if (ggml_backend_get_device(backend.get()) == dev_layer) { + if (ggml_backend_supports_op(backend.get(), cur)) { + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get()); + } + } + } + } + } +} + +ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) { + return model.build_graph(*this, cparams, ubatch, init(), worst_case); +} + llama_perf_context_data llama_context::perf_get_data() const { llama_perf_context_data data = {}; @@ -298,11 +339,7 @@ void llama_context::perf_reset() { 
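The hunks above give llama_context a concrete build_cb() and a build_graph() that simply forwards to the model; the next hunk then removes the build_graph_callback that used to be injected into llama_context_unified. A compact sketch of that shape of refactor (a std::function member replaced by a virtual that delegates to the model), using stand-in types rather than the real ggml/llama ones:

    #include <cstdio>
    #include <string>

    // stand-ins for ggml_cgraph / llama_ubatch; the real types live in ggml and llama.cpp
    struct graph  { std::string desc; };
    struct ubatch { int n_tokens; };

    // before: the context stored a std::function and the caller had to inject it;
    // after (this patch): graph construction is a virtual on the context itself,
    // which forwards to the model - sketch of the "after" shape:
    struct context_base {
        virtual ~context_base() = default;
        virtual graph build_graph(const ubatch & ub, bool worst_case) = 0;
    };

    struct model_stub {
        graph build_graph(const ubatch & ub, bool worst_case) const {
            return { "graph for " + std::to_string(ub.n_tokens) +
                     (worst_case ? " tokens (worst case)" : " tokens") };
        }
    };

    struct context_unified : context_base {
        explicit context_unified(const model_stub & m) : model(m) {}

        graph build_graph(const ubatch & ub, bool worst_case) override {
            return model.build_graph(ub, worst_case); // no callback indirection any more
        }

        const model_stub & model;
    };

    int main() {
        model_stub m;
        context_unified ctx(m);
        std::printf("%s\n", ctx.build_graph({512}, true).desc.c_str());
    }
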
llama_context_unified::llama_context_unified( const llama_model & model, - const llama_context_params & params, - build_graph_callback && cb_build_graph) : - llama_context(model), - cb_build_graph(std::move(cb_build_graph)) { - + const llama_context_params & params) : llama_context(model) { const auto & hparams = model.hparams; cparams.n_seq_max = std::max(1u, params.n_seq_max); @@ -555,7 +592,7 @@ llama_context_unified::llama_context_unified( llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = this->cb_build_graph(*this, ubatch_pp, true); + ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); // reserve pp graph first so that buffers are only allocated once ggml_backend_sched_reserve(sched.get(), gf_pp); @@ -564,13 +601,13 @@ llama_context_unified::llama_context_unified( // reserve with tg graph to get the number of splits and nodes llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = this->cb_build_graph(*this, ubatch_tg, true); + ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); ggml_backend_sched_reserve(sched.get(), gf_tg); int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = this->cb_build_graph(*this, ubatch_pp, true); + gf_pp = build_graph(ubatch_pp, true); if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); @@ -893,7 +930,7 @@ struct llama_context_unified::batch_manager { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = lctx.cb_build_graph(lctx, ubatch, true); + ggml_cgraph * gf = lctx.build_graph(ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(lctx.sched.get()); @@ -1004,7 +1041,7 @@ int llama_context_unified::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + ggml_cgraph * gf = build_graph(ubatch, false); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -1227,7 +1264,7 @@ int llama_context_unified::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + ggml_cgraph * gf = build_graph(ubatch, false); ggml_backend_sched_alloc_graph(sched.get(), gf); diff --git a/src/llama-context.h b/src/llama-context.h index b446118ff2ffd..8d7a6ad58dec4 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -82,6 +82,14 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); + virtual void build_cb( + ggml_tensor * cur, + const char * name, + int il); + + 
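The virtual build_cb() declared here is the per-tensor callback whose definition appears in the llama-context.cpp hunk above: it names each node ("%s-%d" for layer-local tensors, the bare name otherwise) and may also override backend placement for offloading. A tiny sketch of just the naming half, with a stand-in node type instead of ggml_tensor:

    #include <cstdio>
    #include <string>

    // stand-in node; the real callback receives a ggml_tensor and also decides backend placement
    struct node { std::string name; };

    // mirror of the naming logic in build_cb(): layer-local tensors get a "-<il>" suffix,
    // graph-global tensors (il < 0) keep the bare name
    static void name_cb(node & cur, const char * name, int il) {
        if (il >= 0) {
            char buf[64];
            std::snprintf(buf, sizeof(buf), "%s-%d", name, il);
            cur.name = buf;
        } else {
            cur.name = name;
        }
    }

    int main() {
        node a, b;
        name_cb(a, "ffn_out", 12);     // -> "ffn_out-12"
        name_cb(b, "result_norm", -1); // -> "result_norm"
        std::printf("%s %s\n", a.name.c_str(), b.name.c_str());
    }
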
// TODO: add encode/decode graphs + virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case); + // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -171,11 +179,6 @@ struct llama_context : public llama_graph_i { // members - // TODO: temporary public until llama_context implements the graph build function - std::vector backends; - ggml_backend_t backend_cpu = nullptr; - ggml_backend_sched_ptr sched; - protected: const llama_model & model; @@ -189,8 +192,13 @@ struct llama_context : public llama_graph_i { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + ggml_backend_t backend_cpu = nullptr; + std::vector backends; + std::vector> set_n_threads_fns; + ggml_backend_sched_ptr sched; + // memory buffers used to evaluate the model std::vector buf_compute_meta; @@ -213,13 +221,9 @@ class llama_context_unified : public llama_context { public: struct batch_manager; - // TODO: tmp until llama_model starts implementing the graph build function - typedef std::function build_graph_callback; - llama_context_unified( const llama_model & model, - const llama_context_params & params, - build_graph_callback && cb_build_graph); + const llama_context_params & params); virtual ~llama_context_unified(); @@ -244,8 +248,6 @@ class llama_context_unified : public llama_context { llama_sbatch sbatch; - build_graph_callback cb_build_graph; - // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; diff --git a/src/llama-graph.h b/src/llama-graph.h index 37dff8db40541..0084d99ccade6 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -7,9 +7,15 @@ struct ggml_context; struct ggml_tensor; struct llama_ubatch; -// TODO: pass to llama_model graph build +// TODO: can become more granular in the future class llama_graph_i { public: + // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
+ virtual void build_cb( + ggml_tensor * cur, + const char * name, + int il) = 0; + // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0f4b62c434d4b..bded48be6c25b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2,12 +2,17 @@ #include "llama-impl.h" #include "llama-mmap.h" +#include "llama-graph.h" +#include "llama-batch.h" +#include "llama-cparams.h" #include "llama-model-loader.h" #include "ggml-cpp.h" #include #include +#include +#include #include #include #include @@ -3774,6 +3779,7375 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const { return it->second; } +// +// llm_build +// + +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, + LLM_FFN_SWIGLU, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, + LLM_NORM_GROUP, +}; + +struct llm_build_context { + llama_graph_i & lgf; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; + + const int64_t n_embd; + const int64_t n_layer; + const int64_t n_rot; + const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) + const int64_t n_head; + const int64_t n_head_kv; + const int64_t n_embd_head_k; + const int64_t n_embd_k_gqa; + const int64_t n_embd_head_v; + const int64_t n_embd_v_gqa; + const int64_t n_expert; + const int64_t n_expert_used; + + const float freq_base; + const float freq_scale; + const float ext_factor; + const float attn_factor; + const float beta_fast; + const float beta_slow; + const float norm_eps; + const float norm_rms_eps; + + const int32_t n_tokens; + const int32_t n_ctx_orig; + + const bool worst_case; + const bool flash_attn; + + const enum llama_pooling_type pooling_type; + const enum llama_rope_type rope_type; + + const ggml_context_ptr ctx = nullptr; + ggml_context * ctx0 = nullptr; + + // TODO: consider making the entire interface noexcept + llm_build_context( + llama_graph_i & lgf, + const llama_model & model, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + ggml_context_ptr && ctx, + bool worst_case) : + lgf (lgf), + model (model), + hparams (model.hparams), + cparams (cparams), + ubatch (ubatch), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_rot (hparams.n_rot), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head()), + n_head_kv (hparams.n_head_kv()), + n_embd_head_k (hparams.n_embd_head_k), + n_embd_k_gqa (hparams.n_embd_k_gqa()), + n_embd_head_v (hparams.n_embd_head_v), + n_embd_v_gqa (hparams.n_embd_v_gqa()), + n_expert (hparams.n_expert), + n_expert_used (hparams.n_expert_used), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (ubatch.n_tokens), + n_ctx_orig (cparams.n_ctx_orig_yarn), + worst_case (worst_case), + flash_attn (cparams.flash_attn), + pooling_type (cparams.pooling_type), + rope_type (hparams.rope_type), + ctx (std::move(ctx)), + ctx0 (this->ctx.get()) { + } + + // TODO: tmp + void cb(struct ggml_tensor * cur, const char * name, int il) { + lgf.build_cb(cur, name, il); + } + + // TODO: tmp + struct ggml_tensor * 
build_inp_embd(struct ggml_tensor * tok_embd) { + struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch); + cb(inpL, "inp_embd", -1); + + return inpL; + } + + // TODO: tmp + struct ggml_tensor * build_lora_mm( + struct ggml_tensor * w, + struct ggml_tensor * cur) { + return lgf.build_lora_mm(ctx0, w, cur); + } + + // TODO: tmp + struct ggml_tensor * build_lora_mm_id( + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + return lgf.build_lora_mm_id(ctx0, w, cur, ids); + } + + struct ggml_tensor * build_norm( + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM_GROUP: + { + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); + cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); + } break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; + } + + struct ggml_tensor * build_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * up_s, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * gate_s, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * down_s, + struct ggml_tensor * act_scales, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + int il) { + struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (up_s) { + tmp = ggml_mul(ctx0, tmp, up_s); + cb(tmp, "ffn_up_s", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = build_lora_mm(gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = build_lora_mm(gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + + if (gate_s) { + cur = ggml_mul(ctx0, cur, gate_s); + cb(cur, "ffn_gate_s", il); + } + + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx0, cur, act_scales); + cb(cur, "ffn_act", il); + } + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx0, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + case LLM_FFN_SWIGLU: + { + // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + int64_t split_point = cur->ne[0] / 2; + struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + x0 = ggml_silu(ctx0, x0); + cb(cur, "ffn_silu", il); + + cur = ggml_mul(ctx0, x0, x1); + cb(cur, "ffn_mul", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + if (down) { + cur = build_lora_mm(down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + if (down_s) { + cur = ggml_mul(ctx0, cur, down_s); + cb(cur, "ffn_down_s", il); + } + + return cur; + } + + struct ggml_tensor * build_moe_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * gate_inp, + struct ggml_tensor * up_exps, + struct ggml_tensor * gate_exps, + struct ggml_tensor * down_exps, + struct ggml_tensor * exp_probs_b, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, + llama_expert_gating_func_type gating_op, + int il) { + int64_t n_embd = cur->ne[0]; + int64_t n_tokens = cur->ne[1]; + + ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = nullptr; + switch (gating_op) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: + { + probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] + } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: + { + probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] + } break; + default: + GGML_ABORT("fatal error"); + } + cb(probs, "ffn_moe_probs", il); + + // add experts selection bias - introduced in DeepSeek V3 + // leave probs unbiased as it's later used to get expert weights + ggml_tensor * selection_probs = probs; + if (exp_probs_b != nullptr) { + selection_probs = ggml_add(ctx0, probs, exp_probs_b); + cb(selection_probs, "ffn_moe_probs_biased", il); + } + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + cb(selected_experts, "ffn_moe_topk", il); + + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights", il); + + if (norm_w) { + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_norm", il); + + weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); + } + if (scale_w) { + weights = ggml_scale(ctx0, weights, w_scale); + cb(weights, "ffn_moe_weights_scaled", il); + } + + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(gate, "ffn_moe_gate", il); + + switch (type_op) { + case LLM_FFN_SILU: + { + gate = ggml_silu(ctx0, gate); + 
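build_moe_ffn above implements the usual mixture-of-experts routing: softmax the router logits, pick the top n_expert_used experts, optionally renormalize their weights, run each selected expert and sum the weighted results. A stand-alone scalar sketch of that routing (run_expert is a toy stand-in for the per-expert gated FFN done with build_lora_mm_id in the real graph):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    // toy expert: the real path runs a gated FFN per selected expert
    static float run_expert(int expert, float x) { return x * float(expert + 1); }

    // route one "token" (a single scalar here) through the top-k experts and mix the results,
    // following the same steps as build_moe_ffn: softmax -> top-k -> (optional) renormalize -> weighted sum
    static float moe_forward(float x, const std::vector<float> & logits, int n_used, bool norm_w) {
        const int n_expert = (int) logits.size();

        // softmax over the router logits
        std::vector<float> probs(n_expert);
        const float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
        for (auto & p : probs) { p /= sum; }

        // top-k expert selection
        std::vector<int> idx(n_expert);
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // optionally renormalize the selected weights so they sum to 1
        float wsum = 0.0f;
        for (int i = 0; i < n_used; ++i) { wsum += probs[idx[i]]; }

        float out = 0.0f;
        for (int i = 0; i < n_used; ++i) {
            const float w = norm_w ? probs[idx[i]] / wsum : probs[idx[i]];
            out += w * run_expert(idx[i], x);
        }
        return out;
    }

    int main() {
        std::printf("%f\n", moe_forward(1.0f, {0.1f, 2.0f, 0.3f, 1.5f}, /*n_used=*/2, /*norm_w=*/true));
    }
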
cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx0, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ABORT("fatal error"); + } + + ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] + cb(par, "ffn_moe_gate_par", il); + + ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + cb(experts, "ffn_moe_down", il); + + experts = ggml_mul(ctx0, experts, weights); + + // aggregate experts + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + } + } + + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx0, moe_out); + } + + return moe_out; + } + + struct ggml_tensor * build_attn( + struct ggml_cgraph * graph, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(graph, q_cur); + ggml_build_forward_expand(graph, k_cur); + ggml_build_forward_expand(graph, v_cur); + + //build_kv_store(graph, k_cur, v_cur, il); + lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + + struct ggml_tensor * cur; + + //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cb(cur, "kqv_out", il); + + return cur; + } + + struct ggml_tensor * build_rwkv_channel_mix( + const struct llama_layer * layer, + struct ggml_tensor * cur, + struct ggml_tensor * x_prev, + const llm_arch arch) { + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + switch (arch) { + case LLM_ARCH_RWKV6: + { + struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + struct ggml_tensor * k = ggml_sqr( + ctx0, + ggml_relu( + ctx0, + build_lora_mm(layer->channel_mix_key, xk) + ) + ); + cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + } break; + default: + GGML_ABORT("fatal error"); + } + + return cur; + } + + struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + lgf.build_k_shift(ctx0, gf); + + return gf; + } + + struct ggml_cgraph * build_defrag() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + lgf.build_defrag(ctx0, gf); + + return gf; + } + + struct ggml_tensor * build_inp_pos() { + ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); + cb(cur, "inp_pos", -1); + + return cur; + } + + struct ggml_tensor * build_inp_out_ids() { + ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case); + cb(cur, "inp_out_ids", -1); + + return cur; + } + + struct ggml_tensor * build_inp_mean() { + ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens); + cb(cur, "inp_mean", -1); + + return cur; + } + + struct ggml_tensor * 
build_inp_cls() { + ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens); + cb(cur, "inp_cls", -1); + + return cur; + } + + struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { + // find result_norm tensor for input + struct ggml_tensor * inp = nullptr; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + inp = ggml_graph_node(gf, i); + if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { + break; + } + + inp = nullptr; + } + GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); + + struct ggml_tensor * cur; + + switch (pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + cur = inp; + } break; + case LLAMA_POOLING_TYPE_MEAN: + { + struct ggml_tensor * inp_mean = build_inp_mean(); + cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean); + } break; + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + struct ggml_tensor * inp_cls = build_inp_cls(); + cur = ggml_get_rows(ctx0, inp, inp_cls); + } break; + case LLAMA_POOLING_TYPE_RANK: + { + struct ggml_tensor * inp_cls = build_inp_cls(); + inp = ggml_get_rows(ctx0, inp, inp_cls); + + // classification head + // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 + GGML_ASSERT(model.cls != nullptr); + GGML_ASSERT(model.cls_b != nullptr); + + cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b); + cur = ggml_tanh(ctx0, cur); + + // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 + if (model.cls_out) { + GGML_ASSERT(model.cls_out_b != nullptr); + + cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b); + } + } break; + default: + { + GGML_ABORT("unknown pooling type"); + } + } + + cb(cur, "result_embd_pooled", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + //struct ggml_tensor * build_pos_bucket(bool causal) { + // if (causal) { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + // } else { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + // } + + // ggml_set_input(lctx.inp_pos_bucket); + // cb(lctx.inp_pos_bucket, "pos_bucket", -1); + + // return lctx.inp_pos_bucket; + //} + + //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); + // cb(pos_bucket_1d, "pos_bucket_1d", -1); + + // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + // cb(pos_bias, "pos_bias", -1); + + // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); + // cb(pos_bias, "pos_bias", -1); + + // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + // cb(pos_bias, "pos_bias", -1); + + // pos_bias = ggml_cont(ctx0, pos_bias); + // cb(pos_bias, "pos_bias", -1); + + // return pos_bias; + //} + + struct ggml_tensor * build_inp_embd_enc() { + ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case); + cb(cur, "embd_enc", -1); + + return cur; + } + + struct 
ggml_tensor * build_inp_KQ_mask_cross() { + ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + cb(cur, "KQ_mask_cross", -1); + + return cur; + } + + struct ggml_cgraph * build_llama() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = 
build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_deci() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head = hparams.n_head(il); + + if (n_head == 0) { + // attention-free layer of Llama-3_1-Nemotron-51B + cur = inpL; + } else { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } + + if (n_head > 0 && n_head_kv == 0) { + // "linear attention" of Llama-3_1-Nemotron-51B + cur = build_lora_mm(model.layers[il].wo, cur); + cb(cur, "wo", il); + } else if (n_head > 0) { + // self-attention + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + 
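The Qcur/Kcur tensors here are rotated with ggml_rope_ext before attention. Stripped of YaRN scaling and per-dimension frequency factors, rotary position embedding just rotates pairs of dimensions by angles that grow with the token position and shrink with the dimension index; a basic sketch (rope_basic is illustrative, not the ggml kernel):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // rotate consecutive (even, odd) pairs of a head vector by position-dependent angles:
    // theta for pair i is pos * freq_base^(-i/n_rot); the real ggml_rope_ext additionally
    // supports YaRN (ext_factor, attn_factor, beta_fast/slow) and rope_factors per dimension
    static void rope_basic(std::vector<float> & x, int pos, int n_rot, float freq_base) {
        for (int i = 0; i + 1 < n_rot; i += 2) {
            const float theta = pos * std::pow(freq_base, -float(i) / float(n_rot));
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            const float x0 = x[i];
            const float x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }

    int main() {
        std::vector<float> q = {1.0f, 0.0f, 1.0f, 0.0f};
        rope_basic(q, /*pos=*/3, /*n_rot=*/4, /*freq_base=*/10000.0f);
        std::printf("%f %f %f %f\n", q[0], q[1], q[2], q[3]);
    }
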
cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + // modified to support attention-free layer of Llama-3_1-Nemotron-51B + struct ggml_tensor * ffn_inp = cur; + if (n_head > 0) { + ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + } + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_baichuan() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + switch (model.type) { + case LLM_TYPE_7B: + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + break; + case LLM_TYPE_13B: + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); + break; + default: + GGML_ABORT("fatal error"); + } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_xverse() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct 
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_falcon() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = build_norm(inpL, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, il); + cb(cur, "attn_norm_2", il); + } else { + cur = attn_norm; + } + + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 
n_tokens); + + // using mode = 2 for neox mode + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = cur; + + // feed forward + { + cur = build_ffn(attn_norm, // !! use the attn norm, not the result + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = ggml_add(ctx0, cur, inpL); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_grok() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // multiply by embedding_multiplier_scale of 78.38367176906169 + inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, 
+ ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Grok + // if attn_out_norm is present then apply it before adding the input + if (model.layers[il].attn_out_norm) { + cur = build_norm(cur, + model.layers[il].attn_out_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_out_norm", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_GELU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + // Grok + // if layer_out_norm is present then apply it before adding the input + // Idea: maybe ffn_out_norm is a better name + if (model.layers[il].layer_out_norm) { + cur = build_norm(cur, + model.layers[il].layer_out_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "layer_out_norm", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + // Grok + // multiply logits by output_multiplier_scale of 0.5773502691896257 + + cur = ggml_scale(ctx0, cur, 0.5773502691896257f); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_dbrx() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + 
n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, il); + cb(cur, "attn_out_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_starcoder() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, 
n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_refact() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", 
-1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_bert() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + struct ggml_tensor * inp_pos = nullptr; + + if (model.arch != LLM_ARCH_JINA_BERT_V2) { + inp_pos = build_inp_pos(); + } + + // construct input embeddings (token, type, position) + inpL = build_inp_embd(model.tok_embd); + + // token types are hardcoded to zero ("Sentence A") + struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); + inpL = ggml_add(ctx0, inpL, type_row0); + if (model.arch == LLM_ARCH_BERT) { + inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); + } + cb(inpL, "inp_embd", -1); + + // embed layer norm + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + cb(inpL, "inp_norm", -1); + + lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case); + + // iterate layers + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur = inpL; + + struct ggml_tensor * Qcur; + struct ggml_tensor * Kcur; + struct ggml_tensor * Vcur; + + // self-attention + if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); + cb(Qcur, "Qcur", il); + + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, il); + } + + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); + cb(Kcur, "Kcur", il); + + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, il); + } + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + } else { + // compute Q and K and RoPE them + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + cb(kq, "kq", il); + + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 
1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); + cb(kq, "kq_soft_max_ext", il); + + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + cb(cur, "kqv_merged_cont", il); + + ggml_build_forward_expand(gf, cur); + + cur = build_lora_mm(model.layers[il].wo, cur); + if (model.layers[il].bo) { + cb(cur, "kqv_wo", il); + } + + if (model.layers[il].bo) { + cur = ggml_add(ctx0, cur, model.layers[il].bo); + } + cb(cur, "kqv_out", il); + + if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // re-add the layer input + cur = ggml_add(ctx0, cur, inpL); + + // attention layer norm + cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); + + if (model.layers[il].attn_norm_2 != nullptr) { + cur = ggml_add(ctx0, cur, inpL); // re-add the layer input + cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); + } + + struct ggml_tensor * ffn_inp = cur; + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (model.arch == LLM_ARCH_BERT) { + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, il); + } else { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + } + cb(cur, "ffn_out", il); + + // attentions bypass the intermediate layer + cur = ggml_add(ctx0, cur, ffn_inp); + + // output layer norm + cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cb(cur, "result_embd", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_bloom() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + inpL = build_norm(inpL, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, -1); + cb(inpL, "inp_norm", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // 
self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_mpt() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + if (model.pos_embd) { + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + attn_norm = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(attn_norm, "attn_norm", il); + + // self-attention + { + cur = attn_norm; + + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + if (model.layers[il].bqkv){ + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + + if (hparams.f_clamp_kqv > 0.0f) { + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + } + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, 
n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // Q/K Layernorm + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, il); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } else { + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // feed forward + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + model.layers[il].ffn_act, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_stablelm() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + struct ggml_tensor * inpSA = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if 
(model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + } + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM, il); + cb(Kcur, "Kcur", il); + } + + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + if (model.layers[il].ffn_norm) { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + } else { + // parallel residual + cur = inpSA; + } + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_qwen() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, 
ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // using mode = 2 for neox mode + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward forward + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_qwen2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, 
Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_qwen2vl() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_multi( + ctx0, + ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_multi( + ctx0, + ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // 
skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_qwen2moe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + 
model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); + cb(cur_gate_inp, "ffn_shexp_gate_inp", il); + + // sigmoid + ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); + cb(cur_gate, "ffn_shexp_gate", il); + + ggml_tensor * cur_ffn = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur_ffn, "ffn_shexp", il); + + ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate); + cb(ffn_shexp_out, "ffn_shexp_out", il); + + moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out); + cb(moe_out, "ffn_out", il); + + cur = moe_out; + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_phi2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * attn_norm_output; + struct ggml_tensor * ffn_output; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + attn_norm_output = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(attn_norm_output, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv) { + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + } else { + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, n_rot, 
rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + // with phi2, we scale the Q to avoid precision issues + // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 + Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids); + } + + // FF + { + ffn_output = build_ffn(attn_norm_output, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(ffn_output, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_output); + cur = ggml_add(ctx0, cur, inpL); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output_no_bias", -1); + + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); + return gf; + } + + struct ggml_cgraph * build_phi3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + + for (int il = 0; il < n_layer; ++il) { + auto residual = inpL; + + // self-attention + { + // rope freq factors for 128k context + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + struct ggml_tensor* attn_norm_output = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM_RMS, il); + cb(attn_norm_output, "attn_norm", il); + + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv) { + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); + } else { + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, 
build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor* inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, residual); + residual = cur; + + cur = build_norm(cur, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + } + + cur = ggml_add(ctx0, residual, cur); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + + if (model.output_b != nullptr) { + cb(cur, "result_output_no_bias", -1); + cur = ggml_add(ctx0, cur, model.output_b); + } + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + + struct ggml_cgraph * build_plamo() { + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + struct ggml_tensor * attention_norm = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct 
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, + n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, + n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + struct ggml_tensor * sa_out = cur; + + cur = attention_norm; + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // feed-forward network + { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, sa_out); + cur = ggml_add(ctx0, cur, inpL); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_gpt2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * pos; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing 
output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_codeshell() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(tmpq, "tmpq", il); + cb(tmpk, "tmpk", il); + cb(Vcur, "Vcur", il); + + struct ggml_tensor * Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = 
build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_orion() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + // if (model.layers[il].bq) { + // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + // cb(Qcur, "Qcur", il); + // } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + // if (model.layers[il].bk) { + // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + // cb(Kcur, "Kcur", il); + // } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + // if (model.layers[il].bv) { + // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + // cb(Vcur, "Vcur", il); + // } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + 
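+ // residual connection: add the FFN output back to ffn_inp (attention output + layer input)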
+ cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_internlm2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, 
cur); + + return gf; + } + + struct ggml_cgraph * build_minicpm3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // scale the input embeddings + inpL = ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + struct ggml_tensor * q = NULL; + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = build_norm(q, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); + + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); + + // split into {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); + + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + // TODO: the CUDA backend used to not support non-cont. 
(RMS) norm, investigate removing ggml_cont + kv_compressed = ggml_cont(ctx0, kv_compressed); + kv_compressed = build_norm(kv_compressed, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, il); + cb(kv_compressed, "kv_compressed", il); + + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // shared RoPE key + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(k_pe, "k_pe", il); + + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // scale_res - scale the hidden states for residual connection + const float scale_res = scale_depth/sqrtf(float(n_layer)); + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", il); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + // scale the hidden states for residual connection + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_gemma() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + cb(Qcur, "Qcur_scaled", il); 
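+ // the 1.0f/sqrt(n_embd_head_k) attention scale is folded into Qcur here, so build_attn below is called with a scale of 1.0f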
+ + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = build_norm(sa_out, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, sa_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_gemma2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e + switch (model.type) { + case LLM_TYPE_2B: + case LLM_TYPE_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; + case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; + default: GGML_ABORT("fatal error"); + }; + cb(Qcur, "Qcur_scaled", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + } + + cur = build_norm(cur, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, 
il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = build_norm(sa_out, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = build_norm(cur, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, sa_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + // final logit soft-capping + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); + cur = ggml_tanh(ctx0, cur); + cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + + struct ggml_cgraph * build_starcoder2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 
1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_mamba() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = build_inp_embd(model.tok_embd); + + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); + cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // residual + cur = ggml_add(ctx0, cur, inpL); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + // final rmsnorm + cur = build_norm(inpL, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_command_r() { + + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct 
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + if (model.layers[il].attn_q_norm) { + Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur) * n_embd_head, + ggml_element_size(Qcur) * n_embd_head * n_head, + 0); + cb(Qcur, "Qcur", il); + Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, + ggml_element_size(Kcur) * n_embd_head, + ggml_element_size(Kcur) * n_embd_head * n_head_kv, + 0); + cb(Kcur, "Kcur", il); + + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM, il); + cb(Kcur, "Kcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = build_ffn(ffn_inp, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + + } + + struct ggml_cgraph * build_cohere2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + + // sliding window switch pattern + const int32_t sliding_window_pattern = 4; + + for (int il = 0; il < n_layer; ++il) { + // three layers sliding window attention (window size 
4096) and ROPE + // fourth layer uses global attention without positional embeddings + const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + + // norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); + cb(cur, "attn_norm", il); + struct ggml_tensor * ffn_inp = cur; + + // self-attention + { + // rope freq factors for 128k context + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + if (is_sliding) { + Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, + beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, + attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + } else { + // For non-sliding layers, just reshape without applying RoPE + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + } + + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, + n_tokens, 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + } + + struct ggml_tensor * attn_out = cur; + + // feed-forward network + { + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, + il); + cb(cur, "ffn_out", il); + } + + // add together residual + FFN + self-attention + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // ref: https://allenai.org/olmo + // based on the original build_llama() function, changes: + // * non-parametric layer norm + // * clamp qkv + // * removed bias + // * removed MoE + struct ggml_cgraph * build_olmo() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + 
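+ // a single head size is used for Q, K and V, and it matches the RoPE dimension (asserted below)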
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + NULL, NULL, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, nullptr, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + NULL, NULL, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + NULL, NULL, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_olmo2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 
0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = inpL; + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + cur = build_norm(cur, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_ffn(ffn_inp, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // based on the build_qwen2moe() function, changes: + // * removed shared experts + // * removed bias + // * added q, k norm + struct ggml_cgraph * build_olmoe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = 
build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_openelm() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head_qkv = 2*n_head_kv + n_head; + + cur = inpL; + struct ggml_tensor * residual = cur; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * 
Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); + cb(Vcur, "Vcur", il); + + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur", il); + + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); + cb(Qcur, "Vcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = inpL; + + // norm + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_gptneox() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + 
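+ // the fused wqkv projection is split row-wise: Q takes the first n_embd values of each row, K the next n_embd_gqa, V the last n_embd_gqa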
cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // ffn + if (hparams.use_par_res) { + // attention and ffn are computed in parallel + // x = x + attn(ln1(x)) + ffn(ln2(x)) + + struct ggml_tensor * attn_out = cur; + + cur = build_norm(inpL, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, attn_out); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } else { + // attention and ffn are computed sequentially + // x = x + attn(ln1(x)) + // x = x + ffn(ln2(x)) + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_arctic() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", 
il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); + cb(ffn_out, "ffn_out", il); + + // MoE + cur = build_norm(inpSA, + model.layers[il].ffn_norm_exps, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm_exps", il); + + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_out); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_deepseek() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + if ((uint32_t) il < hparams.n_layer_dense_lead) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(moe_out, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + + 
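+ // result_output holds the raw logits for the selected output tokens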
ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_deepseek2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + bool is_lite = (hparams.n_layer == 27); + + // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. + // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); + const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); + const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + struct ggml_tensor * q = NULL; + if (!is_lite) { + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = build_norm(q, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + } else { + q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(q, "q", il); + } + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); + + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); + + // split into {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); + + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + // TODO: the CUDA backend used to not support non-cont. 
(RMS) norm, investigate removing ggml_cont + kv_compressed = ggml_cont(ctx0, kv_compressed); + kv_compressed = build_norm(kv_compressed, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, il); + cb(kv_compressed, "kv_compressed", il); + + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // shared RoPE key + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(k_pe, "k_pe", il); + + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, n_tokens, kq_scale, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + if ((uint32_t) il < hparams.n_layer_dense_lead) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + true, hparams.expert_weights_scale, + (enum llama_expert_gating_func_type) hparams.expert_gating_func, + il); + cb(moe_out, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * ffn_shexp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_bitnet() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].wq_scale) { + Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); + } + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + 
cb(Qcur, "Qcur", il); + } + + // B1.K + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].wk_scale) { + Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); + } + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + // B1.V + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].wv_scale) { + Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); + } + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + NULL, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + + cur = build_norm(cur, + model.layers[il].attn_sub_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_sub_norm", il); + + cur = build_lora_mm(model.layers[il].wo, cur); + if (model.layers[il].wo_scale) { + cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); + } + if (model.layers[il].bo) { + cur = ggml_add(ctx0, cur, model.layers[il].bo); + } + cb(cur, "attn_o_out", il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward forward + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, + NULL, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_sub_out", il); + + cur = build_norm(cur, + model.layers[il].ffn_sub_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_sub_norm", il); + + cur = build_lora_mm(model.layers[il].ffn_down, cur); + if (model.layers[il].ffn_down_scale) { + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); + } + cb(cur, "ffn_down", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + // FIXME: do not use model.tok_embd directly, duplicate as model.output + cur = build_lora_mm(model.tok_embd, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; + } + + //struct ggml_cgraph * build_t5_enc() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + + // inpL = build_inp_embd(model.tok_embd); + + // GGML_ASSERT(lctx.is_encoding); 
+ // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); + + // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; + + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); + + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + // cb(Qcur, "Qcur", il); + + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + // cb(Kcur, "Kcur", il); + + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + // cb(Vcur, "Vcur", il); + + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); + + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); + + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); + + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + // cb(v, "v", il); + + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); + + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); + + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); + + // ggml_build_forward_expand(gf, cur); + + // cur = build_lora_mm(model.layers[il].wo_enc, cur); + // cb(cur, "kqv_out", il); + // } + + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // } + + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + // cb(ffn_inp, "ffn_inp", il); + + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); + + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up_enc, NULL, NULL, + // model.layers[il].ffn_gate_enc, NULL, NULL, + // model.layers[il].ffn_down_enc, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + // il); + // cb(cur, "ffn_out", il); + // } + + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); + + // ggml_tensor * layer_dir = cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); + + // // input for next layer + // inpL = cur; + // } + + // cur = inpL; + // cb(cur, "result_embd", -1); + + // cur = build_norm(cur, + // model.output_norm_enc, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} + + //struct ggml_cgraph * build_t5_dec() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + + // inpL = build_inp_embd(model.tok_embd); + + // GGML_ASSERT(!lctx.is_encoding); + // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + + // struct ggml_tensor * embd_enc = build_inp_embd_enc(); + // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); + + // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; + + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); + + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + // cb(Qcur, "Qcur", il); + + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + // cb(Kcur, "Kcur", il); + + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + // cb(Vcur, "Vcur", il); + + // build_kv_store(gf, Kcur, Vcur, il); + + // struct ggml_tensor * k = + // ggml_view_3d(ctx0, kv_self.k_l[il], + // n_embd_head_k, n_kv, n_head_kv, + // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + // 0); + // cb(k, "k", il); + + // struct ggml_tensor * v = + // ggml_view_3d(ctx0, kv_self.v_l[il], + // n_kv, n_embd_head_v, n_head_kv, + // ggml_element_size(kv_self.v_l[il])*n_ctx, + // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + // 0); + // cb(v, "v", il); + + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); + + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); + + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); + + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + // cb(kqv, "kqv", il); + + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); + + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); + + // ggml_build_forward_expand(gf, cur); + + // cur = build_lora_mm(model.layers[il].wo, cur); + // cb(cur, "kqv_out", il); + // } + + // cur = ggml_add(ctx0, cur, inpSA); + // cb(cur, "cross_inp", il); + + // struct ggml_tensor * inpCA = cur; + + // // norm + // cur = build_norm(cur, + // model.layers[il].attn_norm_cross, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm_cross", il); + + // // cross-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + // cb(Qcur, "Qcur", il); + + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + // cb(Kcur, "Kcur", il); + + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + // cb(Vcur, "Vcur", il); + + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); + + // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); + + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + // cb(v, "v", il); + + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); + + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); + + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); + + // ggml_build_forward_expand(gf, cur); + + // cur = build_lora_mm(model.layers[il].wo_cross, cur); + // cb(cur, "kqv_out", il); + // } + + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + // } + + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + // cb(ffn_inp, "ffn_inp", il); + + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); + + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + // il); + // cb(cur, "ffn_out", il); + // } + + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); + + // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); + + // // input for next layer + // inpL = cur; + // } + + // cur = inpL; + // cb(cur, "result_embd", -1); + + // cur = build_norm(cur, + // model.output_norm, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // // lm_head + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} + + struct ggml_cgraph * build_jais() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = build_norm(inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_chatglm() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == 
hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv == nullptr) { + Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + } + Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + } + Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + } + } else { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + if (model.layers[il].bqkv) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor); + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = build_norm(inpL, + model.output_norm, + NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_nemotron() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + 
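+        // nemotron uses LayerNorm with bias (LLM_NORM) and a squared-ReLU sequential FFN, unlike the RMSNorm/SiLU graphs above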
const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + //GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, model.output_norm_b, + LLM_NORM, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_exaone() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = 
build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + ggml_cgraph * build_rwkv6() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + GGML_ASSERT(hparams.token_shift_count == 2); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; + + for (int il = 0; il < n_layer; ++il) { + const 
llama_layer * layer = &model.layers[il]; + + struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); + + struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + cb(att_norm, "attn_norm", il); + + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + att_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); + + cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + cb(ffn_norm, "ffn_norm", il); + + x_prev = ggml_concat( + ctx0, + ffn_shift, + ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), + 1 + ); + + cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); + cur = ggml_add(ctx0, cur, ffn_inp); + + token_shift = ggml_concat(ctx0, + ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), + ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), + 1 + ); + ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + + if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + cur = ggml_scale(ctx0, cur, 0.5F); + } + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py + ggml_cgraph * build_rwkv6qwen2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; + + inpL = build_inp_embd(model.tok_embd); + + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; + + struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); + + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, 
il); + cb(att_norm, "attn_norm", il); + + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + token_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); + + cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + + token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); + ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + // ref: https://github.com/facebookresearch/chameleon + // based on the original build_llama() function, changes: + // * qk-norm + // * swin-norm + // * removed bias + // * removed MoE + struct ggml_cgraph * build_chameleon() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + if (hparams.swin_norm) { + cur = inpL; + } else { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + if (model.layers[il].attn_q_norm) { + Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur) * n_embd_head, + ggml_element_size(Qcur) * n_embd_head * n_head, + 0); + cb(Qcur, "Qcur", il); + + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, il); + cb(Qcur, "Qcur", il); + } + + if (model.layers[il].attn_k_norm) { + Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, + ggml_element_size(Kcur) * n_embd_head, + ggml_element_size(Kcur) * n_embd_head * n_head_kv, + 0); + cb(Kcur, "Kcur", il); + + 
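+                    // qk-norm: K was reshaped to per-head layout above, so the norm is applied to each head before RoPE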
Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, il); + cb(Kcur, "Kcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = build_attn(gf, + model.layers[il].wo, nullptr, + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + + if (hparams.swin_norm) { + cur = build_norm(cur, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + } + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + if (!hparams.swin_norm) { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + } + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + if (hparams.swin_norm) { + cur = build_norm(cur, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lgf.build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output_with_img_logits", -1); + + // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. + // Needs to be removed once image outputs are supported. 
+ int img_token_end_idx = 8196; + int img_token_start_idx = 4; + int num_img_tokens = img_token_end_idx - img_token_start_idx; + // creates 1d tensor of size num_img_tokens and values -FLT_MAX, + // which ensures that text token values are always at least larger than image token values + struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); + img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); + cb(img_logits, "img_logits", -1); + cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_wavtokenizer_dec() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_b); + + // posnet + for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { + const auto & layer = model.layers[il].posnet; + + inpL = cur; + + switch (il) { + case 0: + case 1: + case 3: + case 4: + { + cur = build_norm(cur, + layer.norm1, + layer.norm1_b, + LLM_NORM_GROUP, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv1_b); + + cur = build_norm(cur, + layer.norm2, + layer.norm2_b, + LLM_NORM_GROUP, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv2_b); + + cur = ggml_add(ctx0, cur, inpL); + } break; + case 2: + { + cur = build_norm(cur, + layer.attn_norm, + layer.attn_norm_b, + LLM_NORM_GROUP, 0); + + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; + + q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); + k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); + v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); + + q = ggml_add(ctx0, q, layer.attn_q_b); + k = ggml_add(ctx0, k, layer.attn_k_b); + v = ggml_add(ctx0, v, layer.attn_v_b); + + q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); + k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + + kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); + + cur = ggml_mul_mat(ctx0, kq, v); + + cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.attn_o_b); + + cur = ggml_add(ctx0, cur, inpL); + } break; + case 5: + { + cur = build_norm(cur, + layer.norm, + layer.norm_b, + LLM_NORM_GROUP, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; + } + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, -1); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = cur; + + // convnext + for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { + const auto & layer = model.layers[il].convnext; + + cur = inpL; + + cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.dw_b); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + layer.norm, + layer.norm_b, + LLM_NORM, -1); + + cur = build_ffn(cur, + layer.pw1, layer.pw1_b, NULL, + NULL, NULL, NULL, + layer.pw2, layer.pw2_b, NULL, + NULL, + LLM_FFN_GELU, 
LLM_FFN_SEQ, il); + + cur = ggml_mul(ctx0, cur, layer.gamma); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = ggml_add(ctx0, cur, inpL); + } + + cur = inpL; + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = build_norm(cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, -1); + + // lm_head + cur = build_lora_mm(model.output, cur); + + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_embd", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } +}; + +ggml_cgraph * llama_model::build_graph( + llama_graph_i & lgf, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + ggml_context_ptr && ctx, + bool worst_case) const { + struct ggml_cgraph * result = NULL; + + struct llm_build_context llm(lgf, *this, cparams, ubatch, std::move(ctx), worst_case); + + switch (arch) { + case LLM_ARCH_LLAMA: + case LLM_ARCH_MINICPM: + case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: + { + result = llm.build_llama(); + } break; + case LLM_ARCH_DECI: + { + result = llm.build_deci(); + } break; + case LLM_ARCH_BAICHUAN: + { + result = llm.build_baichuan(); + } break; + case LLM_ARCH_FALCON: + { + result = llm.build_falcon(); + } break; + case LLM_ARCH_GROK: + { + result = llm.build_grok(); + } break; + case LLM_ARCH_STARCODER: + { + result = llm.build_starcoder(); + } break; + case LLM_ARCH_REFACT: + { + result = llm.build_refact(); + } break; + case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_NOMIC_BERT: + { + result = llm.build_bert(); + } break; + case LLM_ARCH_BLOOM: + { + result = llm.build_bloom(); + } break; + case LLM_ARCH_MPT: + { + result = llm.build_mpt(); + } break; + case LLM_ARCH_STABLELM: + { + result = llm.build_stablelm(); + } break; + case LLM_ARCH_QWEN: + { + result = llm.build_qwen(); + } break; + case LLM_ARCH_QWEN2: + { + result = llm.build_qwen2(); + } break; + case LLM_ARCH_QWEN2VL: + { + result = llm.build_qwen2vl(); + } break; + case LLM_ARCH_QWEN2MOE: + { + result = llm.build_qwen2moe(); + } break; + case LLM_ARCH_PHI2: + { + result = llm.build_phi2(); + } break; + case LLM_ARCH_PHI3: + case LLM_ARCH_PHIMOE: + { + result = llm.build_phi3(); + } break; + case LLM_ARCH_PLAMO: + { + result = llm.build_plamo(); + } break; + case LLM_ARCH_GPT2: + { + result = llm.build_gpt2(); + } break; + case LLM_ARCH_CODESHELL: + { + result = llm.build_codeshell(); + } break; + case LLM_ARCH_ORION: + { + result = llm.build_orion(); + } break; + case LLM_ARCH_INTERNLM2: + { + result = llm.build_internlm2(); + } break; + case LLM_ARCH_MINICPM3: + { + result = llm.build_minicpm3(); + } break; + case LLM_ARCH_GEMMA: + { + result = llm.build_gemma(); + } break; + case LLM_ARCH_GEMMA2: + { + result = llm.build_gemma2(); + } break; + case LLM_ARCH_STARCODER2: + { + result = llm.build_starcoder2(); + } break; + case LLM_ARCH_MAMBA: + { + result = llm.build_mamba(); + } break; + case LLM_ARCH_XVERSE: + { + result = llm.build_xverse(); + } break; + case LLM_ARCH_COMMAND_R: + { + result = llm.build_command_r(); + } break; + case LLM_ARCH_COHERE2: + { + result = llm.build_cohere2(); + } break; + case LLM_ARCH_DBRX: + { + result = llm.build_dbrx(); + } break; + case LLM_ARCH_OLMO: + { + result = llm.build_olmo(); + } break; + case LLM_ARCH_OLMO2: + { + result = llm.build_olmo2(); + } break; + case LLM_ARCH_OLMOE: + { + result = llm.build_olmoe(); + } break; + case LLM_ARCH_OPENELM: + { + result = llm.build_openelm(); + } break; + case LLM_ARCH_GPTNEOX: + { + result = llm.build_gptneox(); + } break; + case LLM_ARCH_ARCTIC: + { + 
result = llm.build_arctic(); + } break; + case LLM_ARCH_DEEPSEEK: + { + result = llm.build_deepseek(); + } break; + case LLM_ARCH_DEEPSEEK2: + { + result = llm.build_deepseek2(); + } break; + case LLM_ARCH_CHATGLM: + { + result = llm.build_chatglm(); + } break; + case LLM_ARCH_BITNET: + { + result = llm.build_bitnet(); + } break; + //case LLM_ARCH_T5: + // { + // if (lctx.is_encoding) { + // result = llm.build_t5_enc(); + // } else { + // result = llm.build_t5_dec(); + // } + // } break; + //case LLM_ARCH_T5ENCODER: + // { + // result = llm.build_t5_enc(); + // } break; + case LLM_ARCH_JAIS: + { + result = llm.build_jais(); + } break; + case LLM_ARCH_NEMOTRON: + { + result = llm.build_nemotron(); + } break; + case LLM_ARCH_EXAONE: + { + result = llm.build_exaone(); + } break; + case LLM_ARCH_RWKV6: + { + result = llm.build_rwkv6(); + } break; + case LLM_ARCH_RWKV6QWEN2: + { + result = llm.build_rwkv6qwen2(); + } break; + case LLM_ARCH_CHAMELEON: + { + result = llm.build_chameleon(); + } break; + case LLM_ARCH_WAVTOKENIZER_DEC: + { + result = llm.build_wavtokenizer_dec(); + } break; + default: + GGML_ABORT("fatal error"); + } + + // add on pooling layer + if (cparams.embeddings) { + result = llm.append_pooling(result); + } + + return result; +} + // // interface implementation // diff --git a/src/llama-model.h b/src/llama-model.h index a7c30444786fd..5d2a07abc570f 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -5,11 +5,16 @@ #include "llama-hparams.h" #include "llama-vocab.h" +#include "ggml-cpp.h" + #include #include #include #include +class llama_graph_i; +struct llama_cparams; +struct llama_ubatch; struct llama_model_loader; // available models @@ -362,6 +367,14 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; + // TODO: add encode/decode graphs + ggml_cgraph * build_graph( + llama_graph_i & lgf, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + ggml_context_ptr && ctx, + bool worst_case) const; + private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama.cpp b/src/llama.cpp index e71a87ee9fcdf..83b66035fc585 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9,7431 +9,18 @@ #include "ggml.h" #include "ggml-backend.h" -#include "ggml-cpp.h" #include -#include -#include -#include -#include #include #include #include #include #include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -// -// llm_build -// - -using llm_build_cb = std::function; - -enum llm_ffn_op_type { - LLM_FFN_SILU, - LLM_FFN_GELU, - LLM_FFN_RELU, - LLM_FFN_RELU_SQR, - LLM_FFN_SWIGLU, -}; - -enum llm_ffn_gate_type { - LLM_FFN_SEQ, - LLM_FFN_PAR, // ffn_gate is parallel to ffn_up -}; - -enum llm_norm_type { - LLM_NORM, - LLM_NORM_RMS, - LLM_NORM_GROUP, -}; - -struct llm_build_context { - llama_graph_i & lgf; - const llama_model & model; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - - const int64_t n_embd; - const int64_t n_layer; - const int64_t n_rot; - const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) - const int64_t n_head; - const int64_t n_head_kv; - const int64_t n_embd_head_k; - const int64_t n_embd_k_gqa; - const int64_t n_embd_head_v; - const int64_t n_embd_v_gqa; - const int64_t n_expert; - const int64_t n_expert_used; - - const float freq_base; - const float freq_scale; - const float ext_factor; - const float attn_factor; - const float beta_fast; - const float beta_slow; - const float 
norm_eps; - const float norm_rms_eps; - - const int32_t n_tokens; - const int32_t n_ctx_orig; - - const bool worst_case; - const bool flash_attn; - - const enum llama_pooling_type pooling_type; - const enum llama_rope_type rope_type; - - const llm_build_cb & cb; - - const ggml_context_ptr ctx = nullptr; - ggml_context * ctx0 = nullptr; - - // TODO: consider making the entire interface noexcept - llm_build_context( - llama_graph_i & lgf, - const llama_model & model, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - llm_build_cb && cb, - ggml_context_ptr && ctx, - bool worst_case) : - lgf (lgf), - model (model), - hparams (model.hparams), - cparams (cparams), - ubatch (ubatch), - n_embd (hparams.n_embd), - n_layer (hparams.n_layer), - n_rot (hparams.n_rot), - n_ctx (cparams.n_ctx), - n_head (hparams.n_head()), - n_head_kv (hparams.n_head_kv()), - n_embd_head_k (hparams.n_embd_head_k), - n_embd_k_gqa (hparams.n_embd_k_gqa()), - n_embd_head_v (hparams.n_embd_head_v), - n_embd_v_gqa (hparams.n_embd_v_gqa()), - n_expert (hparams.n_expert), - n_expert_used (hparams.n_expert_used), - freq_base (cparams.rope_freq_base), - freq_scale (cparams.rope_freq_scale), - ext_factor (cparams.yarn_ext_factor), - attn_factor (cparams.yarn_attn_factor), - beta_fast (cparams.yarn_beta_fast), - beta_slow (cparams.yarn_beta_slow), - norm_eps (hparams.f_norm_eps), - norm_rms_eps (hparams.f_norm_rms_eps), - n_tokens (ubatch.n_tokens), - n_ctx_orig (cparams.n_ctx_orig_yarn), - worst_case (worst_case), - flash_attn (cparams.flash_attn), - pooling_type (cparams.pooling_type), - rope_type (hparams.rope_type), - cb (std::move(cb)), - ctx (std::move(ctx)), - ctx0 (this->ctx.get()) { - } - - // TODO: tmp - struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch); - cb(inpL, "inp_embd", -1); - - return inpL; - } - - // TODO: tmp - struct ggml_tensor * build_lora_mm( - struct ggml_tensor * w, - struct ggml_tensor * cur) { - return lgf.build_lora_mm(ctx0, w, cur); - } - - // TODO: tmp - struct ggml_tensor * build_lora_mm_id( - struct ggml_tensor * w, // struct ggml_tensor * as - struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { - return lgf.build_lora_mm_id(ctx0, w, cur, ids); - } - - struct ggml_tensor * build_norm( - struct ggml_tensor * cur, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type, - int il) { - switch (type) { - case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; - case LLM_NORM_GROUP: - { - cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); - } break; - } - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx0, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - - if (mb) { - cur = ggml_add(ctx0, cur, mb); - } - - return cur; - } - - struct ggml_tensor * build_ffn( - struct ggml_tensor * cur, - struct ggml_tensor * up, - struct ggml_tensor * up_b, - struct ggml_tensor * up_s, - struct ggml_tensor * gate, - struct ggml_tensor * gate_b, - struct ggml_tensor * gate_s, - struct ggml_tensor * down, - struct ggml_tensor * down_b, - struct ggml_tensor * down_s, - struct ggml_tensor * act_scales, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, - const llm_build_cb & cb, 
- int il) { - struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx0, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (up_s) { - tmp = ggml_mul(ctx0, tmp, up_s); - cb(tmp, "ffn_up_s", il); - } - - if (gate) { - switch (type_gate) { - case LLM_FFN_SEQ: - { - cur = build_lora_mm(gate, tmp); - cb(cur, "ffn_gate", il); - } break; - case LLM_FFN_PAR: - { - cur = build_lora_mm(gate, cur); - cb(cur, "ffn_gate", il); - } break; - } - - if (gate_b) { - cur = ggml_add(ctx0, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - - if (gate_s) { - cur = ggml_mul(ctx0, cur, gate_s); - cb(cur, "ffn_gate_s", il); - } - - } else { - cur = tmp; - } - - switch (type_op) { - case LLM_FFN_SILU: - { - cur = ggml_silu(ctx0, cur); - cb(cur, "ffn_silu", il); - } break; - case LLM_FFN_GELU: - { - cur = ggml_gelu(ctx0, cur); - cb(cur, "ffn_gelu", il); - if (act_scales != NULL) { - cur = ggml_div(ctx0, cur, act_scales); - cb(cur, "ffn_act", il); - } - } break; - case LLM_FFN_RELU: - { - cur = ggml_relu(ctx0, cur); - cb(cur, "ffn_relu", il); - } break; - case LLM_FFN_RELU_SQR: - { - cur = ggml_relu(ctx0, cur); - cb(cur, "ffn_relu", il); - - cur = ggml_sqr(ctx0, cur); - cb(cur, "ffn_sqr(relu)", il); - } break; - case LLM_FFN_SWIGLU: - { - // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - int64_t split_point = cur->ne[0] / 2; - struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); - struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_silu(ctx0, x0); - cb(cur, "ffn_silu", il); - - cur = ggml_mul(ctx0, x0, x1); - cb(cur, "ffn_mul", il); - } break; - } - - if (type_gate == LLM_FFN_PAR) { - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "ffn_gate_par", il); - } - - if (down) { - cur = build_lora_mm(down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx0, cur, down_b); - } - - if (down_s) { - cur = ggml_mul(ctx0, cur, down_s); - cb(cur, "ffn_down_s", il); - } - - return cur; - } - - struct ggml_tensor * build_moe_ffn( - struct ggml_tensor * cur, - struct ggml_tensor * gate_inp, - struct ggml_tensor * up_exps, - struct ggml_tensor * gate_exps, - struct ggml_tensor * down_exps, - struct ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llm_ffn_op_type type_op, - bool norm_w, - bool scale_w, - float w_scale, - llama_expert_gating_func_type gating_op, - const llm_build_cb & cb, - int il) { - int64_t n_embd = cur->ne[0]; - int64_t n_tokens = cur->ne[1]; - - ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = nullptr; - switch (gating_op) { - case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: - { - probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] - } break; - case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: - { - probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] - } break; - default: - GGML_ABORT("fatal error"); - } - cb(probs, "ffn_moe_probs", il); - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx0, probs, exp_probs_b); - cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * 
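// Illustrative sketch (not part of this patch): the LLM_FFN_SWIGLU branch above splits the
// up-projection output in half along the feature dimension, applies SiLU to the first half
// and multiplies element-wise with the second. swiglu_sketch is a hypothetical scalar analog.
#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

static std::vector<float> swiglu_sketch(const std::vector<float> & packed) {
    const size_t half = packed.size() / 2;            // split_point in the graph code
    std::vector<float> out(half);
    for (size_t i = 0; i < half; ++i) {
        out[i] = silu(packed[i]) * packed[half + i];   // silu(x0) * x1
    }
    return out;
}

int main() {
    const std::vector<float> packed = {0.5f, -1.0f, 2.0f, 1.5f, 0.25f, -0.75f};
    for (float v : swiglu_sketch(packed)) printf("%.4f ", v);
    printf("\n");
}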
selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - if (norm_w) { - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); - - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - - weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); - } - if (scale_w) { - weights = ggml_scale(ctx0, weights, w_scale); - cb(weights, "ffn_moe_weights_scaled", il); - } - - cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); - - switch (type_op) { - case LLM_FFN_SILU: - { - gate = ggml_silu(ctx0, gate); - cb(gate, "ffn_moe_silu", il); - } break; - case LLM_FFN_GELU: - { - gate = ggml_gelu(ctx0, gate); - cb(gate, "ffn_moe_gelu", il); - } break; - default: - GGML_ABORT("fatal error"); - } - - ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); - - ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx0, experts, weights); - - // aggregate experts - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, - experts->nb[2], i*experts->nb[1]); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx0, moe_out, cur_expert); - } - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx0, moe_out); - } - - return moe_out; - } - - struct ggml_tensor * build_attn( - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, - int32_t n_tokens, - float kq_scale, - const llm_build_cb & cb, - int il) { - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); - - //build_kv_store(graph, k_cur, v_cur, il); - lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); - - struct ggml_tensor * cur; - - //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); - cb(cur, "kqv_out", il); - - return cur; - } - - struct ggml_tensor * build_rwkv_channel_mix( - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev, - const llm_arch arch) { - struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - switch (arch) { - case LLM_ARCH_RWKV6: - { - struct ggml_tensor * xk = ggml_add(ctx0, 
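// Illustrative sketch (not part of this patch): the routing math in build_moe_ffn() above
// selects the top n_expert_used experts by probability (the selection may use bias-adjusted
// probs, as in DeepSeek V3, while the weights come from the unbiased ones) and optionally
// renormalizes the selected weights to sum to 1. route_sketch and expert_choice are hypothetical.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

struct expert_choice { int id; float weight; };

static std::vector<expert_choice> route_sketch(const std::vector<float> & probs, int n_used, bool norm_w) {
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    // top-k by probability, which ggml_top_k realizes via an argsort
    std::partial_sort(idx.begin(), idx.begin() + n_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    std::vector<expert_choice> out;
    float sum = 0.0f;
    for (int i = 0; i < n_used; ++i) {
        out.push_back({idx[i], probs[idx[i]]});
        sum += probs[idx[i]];
    }
    if (norm_w) {
        for (auto & c : out) c.weight /= sum;          // ffn_moe_weights_norm
    }
    return out;
}

int main() {
    const std::vector<float> probs = {0.05f, 0.40f, 0.15f, 0.30f, 0.10f};
    for (const auto & c : route_sketch(probs, 2, true)) {
        printf("expert %d weight %.3f\n", c.id, c.weight);
    }
}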
ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); - struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); - - struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); - struct ggml_tensor * k = ggml_sqr( - ctx0, - ggml_relu( - ctx0, - build_lora_mm(layer->channel_mix_key, xk) - ) - ); - cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); - } break; - default: - GGML_ABORT("fatal error"); - } - - return cur; - } - - struct ggml_cgraph * build_k_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - lgf.build_k_shift(ctx0, gf); - - return gf; - } - - struct ggml_cgraph * build_defrag() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - lgf.build_defrag(ctx0, gf); - - return gf; - } - - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); - cb(cur, "inp_pos", -1); - - return cur; - } - - struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case); - cb(cur, "inp_out_ids", -1); - - return cur; - } - - struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens); - cb(cur, "inp_mean", -1); - - return cur; - } - - struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens); - cb(cur, "inp_cls", -1); - - return cur; - } - - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { - // find result_norm tensor for input - struct ggml_tensor * inp = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - inp = ggml_graph_node(gf, i); - if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { - break; - } - - inp = nullptr; - } - GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); - - struct ggml_tensor * cur; - - switch (pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - cur = inp; - } break; - case LLAMA_POOLING_TYPE_MEAN: - { - struct ggml_tensor * inp_mean = build_inp_mean(); - cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean); - } break; - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - struct ggml_tensor * inp_cls = build_inp_cls(); - cur = ggml_get_rows(ctx0, inp, inp_cls); - } break; - case LLAMA_POOLING_TYPE_RANK: - { - struct ggml_tensor * inp_cls = build_inp_cls(); - inp = ggml_get_rows(ctx0, inp, inp_cls); - - // classification head - // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 - GGML_ASSERT(model.cls != nullptr); - GGML_ASSERT(model.cls_b != nullptr); - - cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b); - cur = ggml_tanh(ctx0, cur); - - // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 - if (model.cls_out) { - GGML_ASSERT(model.cls_out_b != nullptr); - - cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b); - } - } break; - default: - { - GGML_ABORT("unknown pooling type"); - } - } - - cb(cur, "result_embd_pooled", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - //struct ggml_tensor * build_pos_bucket(bool causal) { - // if (causal) { - // lctx.inp_pos_bucket = 
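// Illustrative sketch (not part of this patch): the LLAMA_POOLING_TYPE_MEAN case in
// append_pooling() above is expressed as a matrix multiply with a precomputed inp_mean
// matrix, which amounts to averaging the per-token embeddings of each sequence.
// mean_pool_sketch is a hypothetical scalar analog of that averaging.
#include <cstdio>
#include <vector>

// embd holds n_tokens rows of n_embd floats belonging to one sequence
static std::vector<float> mean_pool_sketch(const std::vector<std::vector<float>> & embd) {
    std::vector<float> out(embd[0].size(), 0.0f);
    for (const auto & row : embd) {
        for (size_t i = 0; i < row.size(); ++i) out[i] += row[i];
    }
    for (float & v : out) v /= embd.size();
    return out;
}

int main() {
    const std::vector<std::vector<float>> embd = {{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}};
    for (float v : mean_pool_sketch(embd)) printf("%.2f ", v);   // 3.00 4.00
    printf("\n");
}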
ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - // } else { - // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - // } - - // ggml_set_input(lctx.inp_pos_bucket); - // cb(lctx.inp_pos_bucket, "pos_bucket", -1); - - // return lctx.inp_pos_bucket; - //} - - //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - // cb(pos_bucket_1d, "pos_bucket_1d", -1); - - // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_cont(ctx0, pos_bias); - // cb(pos_bias, "pos_bias", -1); - - // return pos_bias; - //} - - struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case); - cb(cur, "embd_enc", -1); - - return cur; - } - - struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); - cb(cur, "KQ_mask_cross", -1); - - return cur; - } - - struct ggml_cgraph * build_llama() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - const float kq_scale = hparams.f_attention_scale == 0.0f ? 
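// Illustrative sketch (not part of this patch): the kq_scale expression that continues
// below uses the model's f_attention_scale when set, otherwise the usual scaled-dot-product
// factor 1/sqrt(n_embd_head). kq_scale_sketch is a hypothetical standalone version.
#include <cmath>
#include <cstdio>

static float kq_scale_sketch(float f_attention_scale, int n_embd_head) {
    return f_attention_scale == 0.0f ? 1.0f/std::sqrt((float) n_embd_head) : f_attention_scale;
}

int main() {
    printf("%.6f\n", kq_scale_sketch(0.0f, 128));  // default: 1/sqrt(128)
    printf("%.6f\n", kq_scale_sketch(0.5f, 128));  // model override
}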
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // For Granite architecture - if (hparams.f_residual_scale) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - if (model.layers[il].ffn_gate_inp == nullptr) { - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } else { - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - } - - // For Granite architecture - if (hparams.f_residual_scale) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = 
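// Illustrative sketch (not part of this patch): the ggml_rope_ext calls above rotate
// Q/K in pairs of dimensions by a position-dependent angle. This scalar sketch ignores
// the YaRN factors, frequency scaling and the normal-vs-NEOX pairing layout, so treat it
// as intuition only; rope_sketch is a hypothetical helper.
#include <cmath>
#include <cstdio>
#include <vector>

static void rope_sketch(std::vector<float> & head, int pos, float freq_base) {
    const int d = (int) head.size();
    for (int i = 0; i < d; i += 2) {
        const float theta = pos * std::pow(freq_base, -(float) i / d);
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = head[i], x1 = head[i + 1];
        head[i]     = x0*c - x1*s;
        head[i + 1] = x0*s + x1*c;
    }
}

int main() {
    std::vector<float> q = {1.0f, 0.0f, 1.0f, 0.0f};
    rope_sketch(q, /*pos=*/3, /*freq_base=*/10000.0f);
    for (float v : q) printf("%.4f ", v);
    printf("\n");
}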
build_lora_mm(model.output, cur); - - // For Granite architecture - if (hparams.f_logit_scale) { - cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); - } - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_deci() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head = hparams.n_head(il); - - if (n_head == 0) { - // attention-free layer of Llama-3_1-Nemotron-51B - cur = inpL; - } else { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - } - - if (n_head > 0 && n_head_kv == 0) { - // "linear attention" of Llama-3_1-Nemotron-51B - cur = build_lora_mm(model.layers[il].wo, cur); - cb(cur, "wo", il); - } else if (n_head > 0) { - // self-attention - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // For Granite architecture - if (hparams.f_residual_scale) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); - } - - // modified to support attention-free layer of Llama-3_1-Nemotron-51B - struct ggml_tensor * ffn_inp = cur; - if (n_head > 0) { - ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - } - - // feed-forward network - if 
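// Illustrative sketch (not part of this patch): the per-layer n_head / n_head_kv values
// above imply grouped-query attention, where each KV head is shared by n_head/n_head_kv
// consecutive query heads (assuming n_head_kv divides n_head and both are non-zero).
// kv_head_for_q_head is a hypothetical helper showing that mapping.
#include <cstdio>

static int kv_head_for_q_head(int q_head, int n_head, int n_head_kv) {
    const int group = n_head / n_head_kv;   // query heads per KV head
    return q_head / group;
}

int main() {
    const int n_head = 32, n_head_kv = 8;
    for (int h = 0; h < n_head; h += 7) {
        printf("q head %2d -> kv head %d\n", h, kv_head_for_q_head(h, n_head, n_head_kv));
    }
}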
(model.layers[il].ffn_gate_inp == nullptr) { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // For Granite architecture - if (hparams.f_residual_scale) { - cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - // For Granite architecture - if (hparams.f_logit_scale) { - cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); - } - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - switch (model.type) { - case LLM_TYPE_7B: - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - break; - case LLM_TYPE_13B: - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); - break; - default: - GGML_ABORT("fatal error"); - } - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, 
il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_xverse() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 
model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; - - attn_norm = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(attn_norm, "attn_norm", il); - - // self-attention - { - if (model.layers[il].attn_norm_2) { - // Falcon-40B - cur = build_norm(inpL, - model.layers[il].attn_norm_2, - model.layers[il].attn_norm_2_b, - LLM_NORM, il); - cb(cur, "attn_norm_2", il); - } else { - cur = attn_norm; - } - - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - // using mode = 2 for neox mode - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = cur; - - // feed forward - { - cur = build_ffn(attn_norm, // !! 
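// Illustrative sketch (not part of this patch): after the fused wqkv projection above,
// each row holds [Q | K | V] back to back and the three ggml_view_2d calls slice it at
// offsets 0, n_embd and n_embd + n_embd_gqa. split_qkv_sketch is a hypothetical
// plain-vector analog of that slicing.
#include <cstdio>
#include <vector>

struct qkv_split { std::vector<float> q, k, v; };

static qkv_split split_qkv_sketch(const std::vector<float> & row, int n_embd, int n_embd_gqa) {
    qkv_split out;
    out.q.assign(row.begin(),                       row.begin() + n_embd);
    out.k.assign(row.begin() + n_embd,              row.begin() + n_embd + n_embd_gqa);
    out.v.assign(row.begin() + n_embd + n_embd_gqa, row.end());
    return out;
}

int main() {
    const int n_embd = 4, n_embd_gqa = 2;
    std::vector<float> row(n_embd + 2*n_embd_gqa);
    for (size_t i = 0; i < row.size(); ++i) row[i] = (float) i;
    const qkv_split s = split_qkv_sketch(row, n_embd, n_embd_gqa);
    printf("q=%zu k=%zu v=%zu\n", s.q.size(), s.k.size(), s.v.size());
}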
use the attn norm, not the result - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cur = ggml_add(ctx0, cur, inpL); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_grok() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // multiply by embedding_multiplier_scale of 78.38367176906169 - inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // Grok - // if attn_out_norm is present then apply it before adding the input - if (model.layers[il].attn_out_norm) { - cur = build_norm(cur, - model.layers[il].attn_out_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_out_norm", il); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = 
build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_GELU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - // Grok - // if layer_out_norm is present then apply it before adding the input - // Idea: maybe ffn_out_norm is a better name - if (model.layers[il].layer_out_norm) { - cur = build_norm(cur, - model.layers[il].layer_out_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "layer_out_norm", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - // Grok - // multiply logits by output_multiplier_scale of 0.5773502691896257 - - cur = ggml_scale(ctx0, cur, 0.5773502691896257f); - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_dbrx() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; - - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(cur, "wqkv_clamped", il); - - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, 
inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].attn_out_norm, NULL, - LLM_NORM, il); - cb(cur, "attn_out_norm", il); - - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); - cb(pos, "pos_embd", -1); - - inpL = ggml_add(ctx0, inpL, pos); - cb(inpL, "inpL", -1); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, 
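// Illustrative sketch (not part of this patch): the learned absolute position embedding
// above is a per-position row lookup (ggml_get_rows on pos_embd) added to the token
// embedding (ggml_add). add_pos_embd_sketch is a hypothetical plain-vector analog.
#include <cstdio>
#include <vector>

static void add_pos_embd_sketch(std::vector<float> & tok_embd,
                                const std::vector<std::vector<float>> & pos_embd,
                                int pos) {
    const std::vector<float> & row = pos_embd[pos];   // get_rows on inp_pos
    for (size_t i = 0; i < tok_embd.size(); ++i) tok_embd[i] += row[i];
}

int main() {
    std::vector<float> tok = {0.1f, 0.2f};
    const std::vector<std::vector<float>> pos_embd = {{0.0f, 0.0f}, {1.0f, -1.0f}};
    add_pos_embd_sketch(tok, pos_embd, 1);
    printf("%.2f %.2f\n", tok[0], tok[1]);            // 1.10 -0.80
}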
cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_refact() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_bert() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * inp_pos = nullptr; - - if (model.arch != LLM_ARCH_JINA_BERT_V2) { - inp_pos = build_inp_pos(); - } - - // construct input embeddings (token, type, position) - inpL = build_inp_embd(model.tok_embd); - - // token types are hardcoded to zero ("Sentence A") - struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); - inpL = ggml_add(ctx0, inpL, type_row0); - if (model.arch == LLM_ARCH_BERT) { - inpL = ggml_add(ctx0, ggml_get_rows(ctx0, 
model.pos_embd, inp_pos), inpL); - } - cb(inpL, "inp_embd", -1); - - // embed layer norm - inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - cb(inpL, "inp_norm", -1); - - lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case); - - // iterate layers - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur = inpL; - - struct ggml_tensor * Qcur; - struct ggml_tensor * Kcur; - struct ggml_tensor * Vcur; - - // self-attention - if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); - cb(Qcur, "Qcur", il); - - if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, il); - } - - Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); - cb(Kcur, "Kcur", il); - - if (model.layers[il].attn_k_norm) { - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, il); - } - Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - } else { - // compute Q and K and RoPE them - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); - - //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); - cb(kq, "kq_soft_max_ext", il); - - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); - - ggml_build_forward_expand(gf, cur); - - cur = build_lora_mm(model.layers[il].wo, cur); - if (model.layers[il].bo) { - cb(cur, "kqv_wo", il); - } - - if (model.layers[il].bo) { - cur = ggml_add(ctx0, cur, model.layers[il].bo); - } - cb(cur, "kqv_out", il); - - if (il == 
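// Illustrative sketch (not part of this patch): the explicit attention math in the BERT
// path above computes scores = q·k scaled by 1/sqrt(n_embd_head), a softmax, and then a
// weighted sum of the values. attend_sketch is a hypothetical single-query, single-head,
// unmasked analog for intuition.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> attend_sketch(const std::vector<float> & q,
                                        const std::vector<std::vector<float>> & k,
                                        const std::vector<std::vector<float>> & v) {
    const float scale = 1.0f / std::sqrt((float) q.size());
    std::vector<float> score(k.size());
    float max_s = -1e30f, sum = 0.0f;
    for (size_t t = 0; t < k.size(); ++t) {
        float dot = 0.0f;
        for (size_t i = 0; i < q.size(); ++i) dot += q[i]*k[t][i];
        score[t] = dot*scale;
        max_s = std::max(max_s, score[t]);
    }
    for (float & s : score) { s = std::exp(s - max_s); sum += s; }   // softmax
    std::vector<float> out(v[0].size(), 0.0f);
    for (size_t t = 0; t < v.size(); ++t) {
        const float w = score[t]/sum;
        for (size_t i = 0; i < out.size(); ++i) out[i] += w*v[t][i];
    }
    return out;
}

int main() {
    const std::vector<float> q = {1.0f, 0.0f};
    const std::vector<std::vector<float>> k = {{1.0f, 0.0f}, {0.0f, 1.0f}};
    const std::vector<std::vector<float>> v = {{1.0f, 2.0f}, {3.0f, 4.0f}};
    for (float x : attend_sketch(q, k, v)) printf("%.4f ", x);
    printf("\n");
}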
n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // re-add the layer input - cur = ggml_add(ctx0, cur, inpL); - - // attention layer norm - cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); - - if (model.layers[il].attn_norm_2 != nullptr) { - cur = ggml_add(ctx0, cur, inpL); // re-add the layer input - cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); - } - - struct ggml_tensor * ffn_inp = cur; - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - if (model.arch == LLM_ARCH_BERT) { - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_PAR, cb, il); - } else { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - } - cb(cur, "ffn_out", il); - - // attentions bypass the intermediate layer - cur = ggml_add(ctx0, cur, ffn_inp); - - // output layer norm - cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cb(cur, "result_embd", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - inpL = build_norm(inpL, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, -1); - cb(inpL, "inp_norm", -1); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing 
output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * pos; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - if (model.pos_embd) { - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); - cb(pos, "pos_embd", -1); - - inpL = ggml_add(ctx0, inpL, pos); - cb(inpL, "inpL", -1); - } - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; - - attn_norm = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(attn_norm, "attn_norm", il); - - // self-attention - { - cur = attn_norm; - - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - if (model.layers[il].bqkv){ - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - } - - if (hparams.f_clamp_kqv > 0.0f) { - cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(cur, "wqkv_clamped", il); - } - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - // Q/K Layernorm - if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } else { - Qcur = 
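// Illustrative sketch (not part of this patch): the f_clamp_kqv guard above clamps the
// fused QKV activations element-wise into [-f_clamp_kqv, f_clamp_kqv] (ggml_clamp) before
// the heads are split out; a value of 0 disables it. clamp_kqv_sketch is hypothetical.
#include <algorithm>
#include <cstdio>
#include <vector>

static void clamp_kqv_sketch(std::vector<float> & qkv, float f_clamp_kqv) {
    if (f_clamp_kqv <= 0.0f) return;                  // disabled, as in the code above
    for (float & v : qkv) v = std::clamp(v, -f_clamp_kqv, f_clamp_kqv);
}

int main() {
    std::vector<float> qkv = {-12.0f, 0.5f, 9.0f};
    clamp_kqv_sketch(qkv, 8.0f);
    for (float v : qkv) printf("%.1f ", v);           // -8.0 0.5 8.0
    printf("\n");
}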
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // feed forward - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - model.layers[il].ffn_act, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_stablelm() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - struct ggml_tensor * inpSA = cur; - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - NULL, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - } - if (model.layers[il].attn_k_norm) { - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - NULL, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - } - - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - 
ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - if (model.layers[il].ffn_norm) { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - } else { - // parallel residual - cur = inpSA; - } - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_qwen() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - // using mode = 2 for neox mode - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused 
tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward forward - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, 
NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_qwen2vl() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - int sections[4]; - std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_multi( - ctx0, - ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_multi( - ctx0, - ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - 
LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_qwen2moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self_attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, false, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - // FFN shared expert - { - ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); - cb(cur_gate_inp, "ffn_shexp_gate_inp", il); - - // sigmoid - ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); - cb(cur_gate, "ffn_shexp_gate", il); - - ggml_tensor * cur_ffn = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur_ffn, "ffn_shexp", il); - - ggml_tensor * ffn_shexp_out = 
ggml_mul(ctx0, cur_ffn, cur_gate); - cb(ffn_shexp_out, "ffn_shexp_out", il); - - moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out); - cb(moe_out, "ffn_out", il); - - cur = moe_out; - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_phi2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * attn_norm_output; - struct ggml_tensor * ffn_output; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - attn_norm_output = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(attn_norm_output, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; - - if (model.layers[il].wqkv) { - cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - } else { - Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - // with phi2, we scale the Q to avoid precision issues - // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 - Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = 
ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids); - } - - // FF - { - ffn_output = build_ffn(attn_norm_output, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(ffn_output, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_output); - cur = ggml_add(ctx0, cur, inpL); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output_no_bias", -1); - - cur = ggml_add(ctx0, cur, model.output_b); - cb(cur, "result_output", -1); - ggml_build_forward_expand(gf, cur); - return gf; - } - - struct ggml_cgraph * build_phi3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); - - for (int il = 0; il < n_layer; ++il) { - auto residual = inpL; - - // self-attention - { - // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - struct ggml_tensor* attn_norm_output = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM_RMS, il); - cb(attn_norm_output, "attn_norm", il); - - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; - - if (model.layers[il].wqkv) { - cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); - cb(cur, "wqkv", il); - - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); - } else { - Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - 
cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor* inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - residual = ggml_get_rows(ctx0, residual, inp_out_ids); - } - - cur = ggml_add(ctx0, cur, residual); - residual = cur; - - cur = build_norm(cur, - model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - // feed-forward network - if (model.layers[il].ffn_gate_inp == nullptr) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } else { - // MoE branch - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - } - - cur = ggml_add(ctx0, residual, cur); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - - if (model.output_b != nullptr) { - cb(cur, "result_output_no_bias", -1); - cur = ggml_add(ctx0, cur, model.output_b); - } - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - - struct ggml_cgraph * build_plamo() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - struct ggml_tensor * attention_norm = cur; - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, - n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, - n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - struct ggml_tensor * sa_out = cur; - - cur = attention_norm; - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct 
ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // feed-forward network - { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, sa_out); - cur = ggml_add(ctx0, cur, inpL); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_gpt2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * pos; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); - cb(pos, "pos_embd", -1); - - inpL = ggml_add(ctx0, inpL, pos); - cb(inpL, "inpL", -1); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } 
- - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_codeshell() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(tmpq, "tmpq", il); - cb(tmpk, "tmpk", il); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_orion() { - struct ggml_cgraph 
* gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - // if (model.layers[il].bq) { - // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - // cb(Qcur, "Qcur", il); - // } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - // if (model.layers[il].bk) { - // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - // cb(Kcur, "Kcur", il); - // } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - // if (model.layers[il].bv) { - // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - // cb(Vcur, "Vcur", il); - // } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_internlm2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // 
inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_minicpm3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - //TODO: if the model varies, these parameters need to be read from the model - const int64_t n_embd_base = 256; - const float scale_embd = 12.0f; - const float scale_depth = 1.4f; - const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); - - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // scale the input embeddings - inpL = ggml_scale(ctx0, inpL, scale_embd); - cb(inpL, "inp_scaled", -1); - - // 
inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self_attention - { - struct ggml_tensor * q = NULL; - // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); - cb(q, "q", il); - - q = build_norm(q, - model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, il); - cb(q, "q", il); - - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); - cb(q, "q", il); - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - 0); - cb(q_nope, "q_nope", il); - - // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); - - // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); - - // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], - 0); - cb(kv_compressed, "kv_compressed", il); - - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); - - // TODO: the CUDA backend used to not support non-cont. 
(RMS) norm, investigate removing ggml_cont - kv_compressed = ggml_cont(ctx0, kv_compressed); - kv_compressed = build_norm(kv_compressed, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, il); - cb(kv_compressed, "kv_compressed", il); - - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); - cb(kv, "kv", il); - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); - - // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); - - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); - - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); - - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(q_pe, "q_pe", il); - - // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(k_pe, "k_pe", il); - - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); - - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); - cb(k_states, "k_states", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // scale_res - scale the hidden states for residual connection - const float scale_res = scale_depth/sqrtf(float(n_layer)); - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled", il); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // scale the hidden states for residual connection - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled_ffn", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head scaling - const float scale_lmhead = float(n_embd_base)/float(n_embd); - cur = ggml_scale(ctx0, cur, scale_lmhead); - cb(cur, "lmhead_scaling", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_gemma() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head_k = hparams.n_embd_head_k; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); - cb(inpL, "inp_scaled", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "Qcur", il); - - Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); - cb(Qcur, 
"Qcur_scaled", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); - cb(sa_out, "sa_out", il); - - cur = build_norm(sa_out, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - // feed-forward network - { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, sa_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_gemma2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head_k = hparams.n_embd_head_k; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); - cb(inpL, "inp_scaled", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); - - for (int il = 0; il < n_layer; ++il) { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "Qcur", il); - - // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e - switch (model.type) { - case LLM_TYPE_2B: - case LLM_TYPE_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; - case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; - default: GGML_ABORT("fatal error"); - }; - cb(Qcur, "Qcur_scaled", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); - } - - cur = build_norm(cur, - 
model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_post_norm", il); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); - cb(sa_out, "sa_out", il); - - cur = build_norm(sa_out, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - // feed-forward network - { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "ffn_post_norm", -1); - - cur = ggml_add(ctx0, cur, sa_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - // final logit soft-capping - cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); - cur = ggml_tanh(ctx0, cur); - cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - - struct ggml_cgraph * build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 
1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_mamba() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - // {n_embd, n_tokens} - inpL = build_inp_embd(model.tok_embd); - - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); - - for (int il = 0; il < n_layer; ++il) { - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); - cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // residual - cur = ggml_add(ctx0, cur, inpL); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - // final rmsnorm - cur = build_norm(inpL, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_command_r() { - - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - const float f_logit_scale = hparams.f_logit_scale; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM, il); - cb(cur, "attn_norm", il); - struct ggml_tensor * ffn_inp = cur; - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, 
Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - if (model.layers[il].attn_q_norm) { - Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur) * n_embd_head, - ggml_element_size(Qcur) * n_embd_head * n_head, - 0); - cb(Qcur, "Qcur", il); - Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, - ggml_element_size(Kcur) * n_embd_head, - ggml_element_size(Kcur) * n_embd_head * n_head_kv, - 0); - cb(Kcur, "Kcur", il); - - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - NULL, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - NULL, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); - } - - struct ggml_tensor * attn_out = cur; - - // feed-forward network - { - cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // add together residual + FFN + self-attention - cur = ggml_add(ctx0, cur, inpL); - cur = ggml_add(ctx0, cur, attn_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - if (f_logit_scale) { - cur = ggml_scale(ctx0, cur, f_logit_scale); - } - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - - } - - struct ggml_cgraph * build_cohere2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - const float f_logit_scale = hparams.f_logit_scale; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); - - // sliding window switch pattern - const int32_t sliding_window_pattern = 4; - - for (int il = 0; il 
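Note: the Command-R block above uses a single shared norm and a parallel residual: the attention and FFN branches both read the same normed activations and their outputs are added back onto the layer input together. A toy 1-D sketch of that wiring (the names are placeholders, not graph code):

    #include <cstddef>
    #include <functional>
    #include <vector>

    using vec = std::vector<float>;
    using op  = std::function<vec(const vec &)>;

    // out = x + attn(norm(x)) + ffn(norm(x)), with a single shared "attn_norm"
    static vec command_r_block(const vec & x, const op & norm, const op & attn, const op & ffn) {
        const vec h = norm(x);
        const vec a = attn(h);
        const vec f = ffn(h);
        vec out(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            out[i] = x[i] + a[i] + f[i];
        }
        return out;
    }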
< n_layer; ++il) { - // three layers sliding window attention (window size 4096) and ROPE - // fourth layer uses global attention without positional embeddings - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - - // norm - cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); - cb(cur, "attn_norm", il); - struct ggml_tensor * ffn_inp = cur; - - // self-attention - { - // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - if (is_sliding) { - Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, - beta_fast, beta_slow); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, - attn_factor, beta_fast, beta_slow); - cb(Kcur, "Kcur", il); - } else { - // For non-sliding layers, just reshape without applying RoPE - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - } - - cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - n_tokens, 1.0f / sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); - } - - struct ggml_tensor * attn_out = cur; - - // feed-forward network - { - cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, - NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, - cb, il); - cb(cur, "ffn_out", il); - } - - // add together residual + FFN + self-attention - cur = ggml_add(ctx0, cur, inpL); - cur = ggml_add(ctx0, cur, attn_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - if (f_logit_scale) { - cur = ggml_scale(ctx0, cur, f_logit_scale); - } - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // ref: https://allenai.org/olmo - // based on the original build_llama() function, changes: - // * non-parametric layer norm - // * clamp qkv - // * removed bias - // * removed MoE - struct ggml_cgraph * build_olmo() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 
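Note: the Cohere2 loop above alternates three sliding-window layers (with RoPE) and one global layer without positional embeddings, selected by il % sliding_window_pattern. A small sketch of the layer predicate plus a causal sliding-window visibility test; the window test is a standard formulation assumed here, not copied from this patch:

    #include <cstdint>

    // layer predicate from the loop above: 3 local layers, then 1 global one
    static bool is_sliding_layer(int il, int sliding_window_pattern = 4) {
        return il % sliding_window_pattern < (sliding_window_pattern - 1);
    }

    // causal sliding-window visibility test (assumed formulation)
    static bool can_attend(int64_t q_pos, int64_t k_pos, int64_t window) {
        return k_pos <= q_pos && q_pos - k_pos < window;
    }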
model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - NULL, NULL, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (hparams.f_clamp_kqv > 0.0f) { - Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (hparams.f_clamp_kqv > 0.0f) { - Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (hparams.f_clamp_kqv > 0.0f) { - Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - NULL, NULL, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - NULL, NULL, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_olmo2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = 
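Note: OLMo optionally clamps the Q/K/V activations into [-f_clamp_kqv, f_clamp_kqv] before RoPE and attention, as the ggml_clamp calls above show. A toy element-wise version of that guard:

    #include <algorithm>
    #include <vector>

    // clamp Q/K/V activations into [-c, c]; c <= 0 means clamping is disabled
    static void clamp_kqv(std::vector<float> & t, float c) {
        if (c <= 0.0f) {
            return;
        }
        for (float & x : t) {
            x = std::min(std::max(x, -c), c);
        }
    }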
build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = inpL; - - // self_attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, il); - cb(Qcur, "Qcur_normed", il); - - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, il); - cb(Kcur, "Kcur_normed", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur_rope", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - cur = build_norm(cur, - model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_post_norm", il); - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = build_norm(cur, - model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "ffn_post_norm", -1); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // based on the build_qwen2moe() function, changes: - // * removed shared experts - // * removed bias - // * added q, k norm - struct ggml_cgraph * build_olmoe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // 
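Note: OLMo2 applies RMS norm to the Q and K projections (attn_q_norm / attn_k_norm) before reshaping and RoPE. A standalone per-vector RMSNorm sketch; the epsilon value is an assumption here:

    #include <cmath>

    // x: one head vector of length n; w: the norm weight; eps is an assumed default
    static void rms_norm(float * x, const float * w, int n, float eps = 1e-6f) {
        float ss = 0.0f;
        for (int i = 0; i < n; ++i) {
            ss += x[i] * x[i];
        }
        const float scale = 1.0f / std::sqrt(ss / n + eps);
        for (int i = 0; i < n; ++i) {
            x[i] = x[i] * scale * w[i];
        }
    }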
self_attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, il); - cb(Qcur, "Qcur_normed", il); - - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, il); - cb(Kcur, "Kcur_normed", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur_rope", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // MoE branch - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, false, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_openelm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head_qkv = 2*n_head_kv + n_head; - - cur = inpL; - struct ggml_tensor * residual = cur; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, 
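Note: the OLMoE MoE branch routes each token through build_moe_ffn with softmax gating and n_expert_used experts. A hedged sketch of top-k routing with weights renormalized over the selected experts; whether the softmax is taken before or after the top-k selection differs between variants and is an assumption here:

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <vector>

    // pick the k largest gate logits and renormalize their weights (k <= n_expert)
    static void route_top_k(const std::vector<float> & logits, int k,
                            std::vector<int> & idx, std::vector<float> & w) {
        idx.resize(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int a, int b) { return logits[a] > logits[b]; });
        idx.resize(k);

        w.resize(k);
        float sum = 0.0f;
        for (int i = 0; i < k; ++i) {
            w[i] = std::exp(logits[idx[i]] - logits[idx[0]]); // idx[0] holds the max logit
            sum += w[i];
        }
        for (float & wi : w) {
            wi /= sum;
        }
    }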
n_tokens, cur->nb[1], cur->nb[2], 0)); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); - cb(Vcur, "Vcur", il); - - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, il); - cb(Qcur, "Qcur", il); - - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, il); - cb(Kcur, "Kcur", il); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); - cb(Qcur, "Vcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - residual = ggml_get_rows(ctx0, residual, inp_out_ids); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - inpL = cur; - } - - cur = inpL; - - // norm - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_gptneox() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, 
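Note: OpenELM and GPT-NeoX both project a fused QKV tensor and then slice it into Q, K and V views, as the ggml_view_* calls above do via byte offsets. The same split on a plain per-token row, assuming the [Q | K | V] layout implied by those offsets:

    #include <cstdint>

    struct qkv_views {
        const float * q; // n_embd values
        const float * k; // n_embd_gqa values
        const float * v; // n_embd_gqa values
    };

    // row: one token's fused QKV output, laid out as [Q | K | V]
    static qkv_views split_qkv(const float * row, int64_t n_embd, int64_t n_embd_gqa) {
        return { row, row + n_embd, row + n_embd + n_embd_gqa };
    }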
ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // ffn - if (hparams.use_par_res) { - // attention and ffn are computed in parallel - // x = x + attn(ln1(x)) + ffn(ln2(x)) - - struct ggml_tensor * attn_out = cur; - - cur = build_norm(inpL, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, attn_out); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } else { - // attention and ffn are computed sequentially - // x = x + attn(ln1(x)) - // x = x + ffn(ln2(x)) - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_arctic() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute 
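Note: the GPT-NeoX builder above selects between the parallel and sequential residual layouts via hparams.use_par_res, matching the formulas in its comments. A toy 1-D restatement of the two layouts:

    #include <cstddef>
    #include <functional>
    #include <vector>

    using vec = std::vector<float>;
    using op  = std::function<vec(const vec &)>;

    static vec add(const vec & a, const vec & b) {
        vec r(a.size());
        for (size_t i = 0; i < a.size(); ++i) {
            r[i] = a[i] + b[i];
        }
        return r;
    }

    // use_par_res == true:  x = x + attn(ln1(x)) + ffn(ln2(x))
    static vec block_parallel(const vec & x, const op & ln1, const op & ln2,
                              const op & attn, const op & ffn) {
        return add(add(x, attn(ln1(x))), ffn(ln2(x)));
    }

    // use_par_res == false: x = x + attn(ln1(x)); x = x + ffn(ln2(x))
    static vec block_sequential(const vec & x, const op & ln1, const op & ln2,
                                const op & attn, const op & ffn) {
        const vec h = add(x, attn(ln1(x)));
        return add(h, ffn(ln2(h)));
    }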
Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); - cb(ffn_out, "ffn_out", il); - - // MoE - cur = build_norm(inpSA, - model.layers[il].ffn_norm_exps, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm_exps", il); - - cur = build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(cur, "ffn_moe_out", il); - - cur = ggml_add(ctx0, cur, ffn_out); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_deepseek() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - const float kq_scale = hparams.f_attention_scale == 0.0f ? 
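Note: the DeepSeek builder above scales Q.K by hparams.f_attention_scale when it is set and otherwise falls back to the usual 1/sqrt(n_embd_head). The same selection as a standalone helper:

    #include <cmath>

    static float kq_scale(float f_attention_scale, int n_embd_head) {
        return f_attention_scale == 0.0f ? 1.0f / std::sqrt((float) n_embd_head)
                                         : f_attention_scale;
    }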
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } else { - // MoE branch - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, false, - false, hparams.expert_weights_scale, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - cb, il); - cb(moe_out, "ffn_moe_out", il); - - // FFN shared expert - { - ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(ffn_shexp, "ffn_shexp", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - - 
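Note: in the DeepSeek MoE branch above, the routed expert output (ffn_moe_out) is added to an always-on shared expert (ffn_shexp). A toy combination of the two, with the routed indices and weights assumed to come from the gating step:

    #include <cstddef>
    #include <functional>
    #include <vector>

    using vec = std::vector<float>;

    // out = shared(x) + sum_i w_i * expert_{idx_i}(x)
    static vec moe_with_shared(const vec & x,
                               const std::vector<std::function<vec(const vec &)>> & experts,
                               const std::vector<int> & top_idx, const std::vector<float> & top_w,
                               const std::function<vec(const vec &)> & shared) {
        vec out = shared(x);
        for (size_t i = 0; i < top_idx.size(); ++i) {
            const vec e = experts[top_idx[i]](x);
            for (size_t j = 0; j < out.size(); ++j) {
                out[j] += top_w[i] * e[j];
            }
        }
        return out;
    }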
ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_deepseek2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - bool is_lite = (hparams.n_layer == 27); - - // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. - // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. - const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); - const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); - const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); - - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - // {n_embd, n_tokens} - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self_attention - { - struct ggml_tensor * q = NULL; - if (!is_lite) { - // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); - cb(q, "q", il); - - q = build_norm(q, - model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, il); - cb(q, "q", il); - - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); - cb(q, "q", il); - } else { - q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(q, "q", il); - } - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - 0); - cb(q_nope, "q_nope", il); - - // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); - - // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); - - // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], - 0); - cb(kv_compressed, "kv_compressed", il); - - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); - - // TODO: the CUDA backend used to not support non-cont. 
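Note: build_deepseek2 pre-scales kq_scale and attn_factor so YaRN RoPE behaves correctly, per the linked discussion. The same arithmetic restated as a standalone helper, using the exact expressions from the code above:

    #include <cmath>

    struct yarn_scales {
        float kq_scale;           // scale applied to Q.K
        float attn_factor_scaled; // attn_factor passed to ggml_rope_ext
    };

    static yarn_scales deepseek2_yarn_scales(float attn_factor, float rope_yarn_log_mul,
                                             float freq_scale, int n_embd_head_k) {
        const float mscale = attn_factor * (1.0f + rope_yarn_log_mul * std::log(1.0f / freq_scale));
        return {
            1.0f * mscale * mscale / std::sqrt((float) n_embd_head_k),
            1.0f / (1.0f + 0.1f * std::log(1.0f / freq_scale)),
        };
    }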
(RMS) norm, investigate removing ggml_cont - kv_compressed = ggml_cont(ctx0, kv_compressed); - kv_compressed = build_norm(kv_compressed, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, il); - cb(kv_compressed, "kv_compressed", il); - - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); - cb(kv, "kv", il); - - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); - - // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); - - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); - - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); - - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_pe, "q_pe", il); - - // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
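Note: in the MLA attention above, each head's key is the concatenation of its own no-position ("nope") part with a single RoPE'd k_pe that is repeated across all heads. A plain-array sketch of that per-head assembly:

    #include <cstring>
    #include <vector>

    // k_nope_head: this head's n_embd_head_qk_nope values;
    // k_pe_shared: the RoPE'd n_embd_head_qk_rope values shared by all heads
    static std::vector<float> build_k_head(const float * k_nope_head, int n_nope,
                                           const float * k_pe_shared, int n_rope) {
        std::vector<float> k(n_nope + n_rope);
        std::memcpy(k.data(),          k_nope_head, n_nope * sizeof(float));
        std::memcpy(k.data() + n_nope, k_pe_shared, n_rope * sizeof(float));
        return k;
    }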
RoPE, investigate removing this - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_pe, "k_pe", il); - - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); - - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); - cb(k_states, "k_states", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } else { - // MoE branch - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - model.layers[il].ffn_exp_probs_b, - n_expert, n_expert_used, - LLM_FFN_SILU, hparams.expert_weights_norm, - true, hparams.expert_weights_scale, - (enum llama_expert_gating_func_type) hparams.expert_gating_func, - cb, il); - cb(moe_out, "ffn_moe_out", il); - - // FFN shared expert - { - ggml_tensor * ffn_shexp = build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(ffn_shexp, "ffn_shexp", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } - } - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_bitnet() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - if (model.layers[il].wq_scale) { - Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); - } - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, 
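Note: the BitNet projections above optionally multiply the matmul result by a per-tensor scale (wq_scale, wk_scale, wv_scale, wo_scale, ffn_*_scale). A toy version of that guard; treating the scale as a single scalar is an assumption, real scale tensors may broadcast differently:

    #include <vector>

    // y: projection output for one token; scale: optional per-tensor scale (nullptr if absent)
    static void apply_optional_scale(std::vector<float> & y, const float * scale) {
        if (scale == nullptr) {
            return; // this projection has no scale tensor
        }
        for (float & v : y) {
            v *= *scale; // assumes a single scalar; real scale tensors may broadcast
        }
    }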
model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - // B1.K - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - if (model.layers[il].wk_scale) { - Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); - } - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - // B1.V - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - if (model.layers[il].wv_scale) { - Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); - } - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - NULL, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - - cur = build_norm(cur, - model.layers[il].attn_sub_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_sub_norm", il); - - cur = build_lora_mm(model.layers[il].wo, cur); - if (model.layers[il].wo_scale) { - cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); - } - if (model.layers[il].bo) { - cur = ggml_add(ctx0, cur, model.layers[il].bo); - } - cb(cur, "attn_o_out", il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward forward - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, - model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, - NULL, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_sub_out", il); - - cur = build_norm(cur, - model.layers[il].ffn_sub_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_sub_norm", il); - - cur = build_lora_mm(model.layers[il].ffn_down, cur); - if (model.layers[il].ffn_down_scale) { - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); - } - cb(cur, "ffn_down", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - // FIXME: do not use model.tok_embd directly, duplicate as model.output - cur = build_lora_mm(model.tok_embd, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - return gf; - } - - //struct ggml_cgraph * build_t5_enc() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - // const int64_t n_embd_head = hparams.n_embd_head_v; - // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - - // inpL = build_inp_embd(model.tok_embd); - - // 
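Note: as the FIXME above points out, the BitNet head reuses model.tok_embd directly as the output projection (tied embeddings). A naive standalone sketch of what that tied LM head computes for one token:

    #include <cstddef>
    #include <vector>

    // tok_embd: n_vocab x n_embd (row-major); h: hidden state of one token
    static std::vector<float> tied_lm_head(const std::vector<float> & tok_embd,
                                           const std::vector<float> & h,
                                           int n_vocab, int n_embd) {
        std::vector<float> logits(n_vocab, 0.0f);
        for (int v = 0; v < n_vocab; ++v) {
            const float * row = tok_embd.data() + (size_t) v * n_embd;
            for (int i = 0; i < n_embd; ++i) {
                logits[v] += row[i] * h[i];
            }
        }
        return logits;
    }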
GGML_ASSERT(lctx.is_encoding); - // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - - // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - - // for (int il = 0; il < n_layer; ++il) { - // struct ggml_tensor * inpSA = inpL; - - // // norm - // cur = build_norm(inpL, - // model.layers[il].attn_norm_enc, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm", il); - - // // self-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); - // cb(Qcur, "Qcur", il); - - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); - // cb(Kcur, "Kcur", il); - - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); - // cb(Vcur, "Vcur", il); - - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); - - // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - // cb(kq_b, "kq_b", il); - - // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); - - // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - // cb(v, "v", il); - - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - // cb(kqv, "kqv", il); - - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); - - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); - - // ggml_build_forward_expand(gf, cur); - - // cur = build_lora_mm(model.layers[il].wo_enc, cur); - // cb(cur, "kqv_out", il); - // } - - // if (il == n_layer - 1) { - // // skip computing output for unused tokens - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - // } - - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - // cb(ffn_inp, "ffn_inp", il); - - // // feed-forward network - // { - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm_enc, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); - - // // T5 uses relu, flan-T5 uses gelu-gated - // cur = build_ffn(cur, - // model.layers[il].ffn_up_enc, NULL, NULL, - // model.layers[il].ffn_gate_enc, NULL, NULL, - // model.layers[il].ffn_down_enc, NULL, NULL, - // NULL, - // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - // model.layers[il].ffn_gate_enc ? 
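Note: the commented-out T5 encoder path adds a learned relative-position bias to the raw attention scores (kq_b = kq + pos_bias) before the softmax. A conceptual sketch with the bucketing function left abstract, since its exact definition is not part of this hunk:

    #include <cstddef>
    #include <functional>
    #include <vector>

    // kq: n_q x n_k raw attention scores, row-major; attn_rel_b: one bias per bucket
    static void add_pos_bias(std::vector<float> & kq, int n_q, int n_k,
                             const std::function<int(int /*k_pos - q_pos*/)> & bucket,
                             const std::vector<float> & attn_rel_b) {
        for (int i = 0; i < n_q; ++i) {
            for (int j = 0; j < n_k; ++j) {
                kq[(size_t) i * n_k + j] += attn_rel_b[bucket(j - i)];
            }
        }
    }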
LLM_FFN_PAR : LLM_FFN_SEQ, - // cb, il); - // cb(cur, "ffn_out", il); - // } - - // cur = ggml_add(ctx0, cur, ffn_inp); - // cb(cur, "ffn_out", il); - - // ggml_tensor * layer_dir = cvec.tensor_for(il); - // if (layer_dir != nullptr) { - // cur = ggml_add(ctx0, cur, layer_dir); - // } - // cb(cur, "l_out", il); - - // // input for next layer - // inpL = cur; - // } - - // cur = inpL; - // cb(cur, "result_embd", -1); - - // cur = build_norm(cur, - // model.output_norm_enc, NULL, - // LLM_NORM_RMS, -1); - // cb(cur, "result_norm", -1); - - // ggml_build_forward_expand(gf, cur); - - // return gf; - //} - - //struct ggml_cgraph * build_t5_dec() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - // const int64_t n_embd_head = hparams.n_embd_head_v; - // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - - // inpL = build_inp_embd(model.tok_embd); - - // GGML_ASSERT(!lctx.is_encoding); - // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - - // struct ggml_tensor * embd_enc = build_inp_embd_enc(); - // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - - // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); - - // for (int il = 0; il < n_layer; ++il) { - // struct ggml_tensor * inpSA = inpL; - - // // norm - // cur = build_norm(inpL, - // model.layers[il].attn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm", il); - - // // self-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - // cb(Qcur, "Qcur", il); - - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - // cb(Kcur, "Kcur", il); - - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - // cb(Vcur, "Vcur", il); - - // build_kv_store(gf, Kcur, Vcur, il); - - // struct ggml_tensor * k = - // ggml_view_3d(ctx0, kv_self.k_l[il], - // n_embd_head_k, n_kv, n_head_kv, - // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - // 0); - // cb(k, "k", il); - - // struct ggml_tensor * v = - // ggml_view_3d(ctx0, kv_self.v_l[il], - // n_kv, n_embd_head_v, n_head_kv, - // ggml_element_size(kv_self.v_l[il])*n_ctx, - // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - // 0); - // cb(v, "v", il); - - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); - - // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
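Note: as the comment in the T5 code says, T5 uses a plain ReLU FFN while flan-T5 uses a gated GELU, selected by whether ffn_gate_enc exists. A per-element sketch of that selection; the tanh approximation of GELU is an assumption, not necessarily the exact variant ggml uses:

    #include <cmath>

    // tanh approximation of GELU (assumed; the exact ggml variant may differ)
    static float gelu(float x) {
        return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
    }

    static float relu(float x) { return x > 0.0f ? x : 0.0f; }

    // per-element FFN activation: gated GELU when a gate value exists, plain ReLU otherwise
    static float ffn_act(float up, const float * gate /* nullable */) {
        return gate ? gelu(*gate) * up : relu(up);
    }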
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - // cb(kq_b, "kq_b", il); - - // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); - - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - // cb(kqv, "kqv", il); - - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); - - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); - - // ggml_build_forward_expand(gf, cur); - - // cur = build_lora_mm(model.layers[il].wo, cur); - // cb(cur, "kqv_out", il); - // } - - // cur = ggml_add(ctx0, cur, inpSA); - // cb(cur, "cross_inp", il); - - // struct ggml_tensor * inpCA = cur; - - // // norm - // cur = build_norm(cur, - // model.layers[il].attn_norm_cross, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm_cross", il); - - // // cross-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); - // cb(Qcur, "Qcur", il); - - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); - // cb(Kcur, "Kcur", il); - - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); - // cb(Vcur, "Vcur", il); - - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); - - // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); - - // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - // cb(v, "v", il); - - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - // cb(kqv, "kqv", il); - - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); - - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); - - // ggml_build_forward_expand(gf, cur); - - // cur = build_lora_mm(model.layers[il].wo_cross, cur); - // cb(cur, "kqv_out", il); - // } - - // if (il == n_layer - 1) { - // // skip computing output for unused tokens - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - // } - - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - // cb(ffn_inp, "ffn_inp", il); - - // // feed-forward network - // { - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); - - // // T5 uses relu, flan-T5 uses gelu-gated - // cur = build_ffn(cur, - // model.layers[il].ffn_up, NULL, NULL, - // model.layers[il].ffn_gate, NULL, NULL, - // model.layers[il].ffn_down, NULL, NULL, - // NULL, - // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - // model.layers[il].ffn_gate_enc ? 
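Note: the commented decoder block above wires cross-attention: queries come from the decoder stream, while keys and values are projected from the fixed encoder output, so scores span n_outputs_enc positions. A plain-loop, single-head sketch of that computation (unscaled, as in the surrounding code):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // q: n_embd_head; k, v: n_outputs_enc x n_embd_head, row-major
    static std::vector<float> cross_attend(const std::vector<float> & q,
                                           const std::vector<float> & k,
                                           const std::vector<float> & v,
                                           int n_outputs_enc, int n_embd_head) {
        std::vector<float> scores(n_outputs_enc);
        float max_s = -1e30f;
        for (int j = 0; j < n_outputs_enc; ++j) {
            float s = 0.0f;
            for (int d = 0; d < n_embd_head; ++d) {
                s += q[d] * k[(size_t) j * n_embd_head + d];
            }
            scores[j] = s;
            max_s = std::max(max_s, s);
        }
        float sum = 0.0f;
        for (float & s : scores) {
            s = std::exp(s - max_s);
            sum += s;
        }
        std::vector<float> out(n_embd_head, 0.0f);
        for (int j = 0; j < n_outputs_enc; ++j) {
            const float w = scores[j] / sum;
            for (int d = 0; d < n_embd_head; ++d) {
                out[d] += w * v[(size_t) j * n_embd_head + d];
            }
        }
        return out;
    }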
LLM_FFN_PAR : LLM_FFN_SEQ, - // cb, il); - // cb(cur, "ffn_out", il); - // } - - // cur = ggml_add(ctx0, cur, ffn_inp); - // cb(cur, "ffn_out", il); - - // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - // if (layer_dir != nullptr) { - // cur = ggml_add(ctx0, cur, layer_dir); - // } - // cb(cur, "l_out", il); - - // // input for next layer - // inpL = cur; - // } - - // cur = inpL; - // cb(cur, "result_embd", -1); - - // cur = build_norm(cur, - // model.output_norm, NULL, - // LLM_NORM_RMS, -1); - // cb(cur, "result_norm", -1); - - // // lm_head - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); - - // ggml_build_forward_expand(gf, cur); - - // return gf; - //} - - struct ggml_cgraph * build_jais() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - } - - // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); - } - - cur = build_norm(inpL, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_chatglm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == 
hparams.n_embd_head_k); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, - model.layers[il].attn_norm, - NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; - - if (model.layers[il].wqkv == nullptr) { - Qcur = build_lora_mm(model.layers[il].wq, cur); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - } - Kcur = build_lora_mm(model.layers[il].wk, cur); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - } - Vcur = build_lora_mm(model.layers[il].wv, cur); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - } - } else { - cur = build_lora_mm(model.layers[il].wqkv, cur); - cb(cur, "wqkv", il); - if (model.layers[il].bqkv) { - cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); - } - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor); - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur_rope", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(gf, - model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // FF - { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il); - cb(cur, "ffn_out", il); - - } - - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); - } - - cur = build_norm(inpL, - model.output_norm, - NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_nemotron() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), 
false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - //GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, - model.layers[il].attn_norm_b, - LLM_NORM, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - NULL, NULL, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - NULL, - LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, model.output_norm_b, - LLM_NORM, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_exaone() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct 
ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - GGML_ASSERT(hparams.token_shift_count == 2); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); - - const auto n_embd = hparams.n_embd; - const auto n_seq_tokens = ubatch.n_seq_tokens; - const auto n_seqs = ubatch.n_seqs; - - for (int il = 0; 
il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; - - struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( - ctx0, gf, state_copy, state_mask, ubatch, il, worst_case - ); - - struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - - struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); - cb(att_norm, "attn_norm", il); - - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - att_shift, - ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), - 1 - ); - - cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); - cb(ffn_norm, "ffn_norm", il); - - x_prev = ggml_concat( - ctx0, - ffn_shift, - ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), - 1 - ); - - cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); - cur = ggml_add(ctx0, cur, ffn_inp); - - token_shift = ggml_concat(ctx0, - ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), - ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), - 1 - ); - ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - - if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - cur = ggml_scale(ctx0, cur, 0.5F); - } - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - - cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); - - const auto n_embd = hparams.n_embd; - const auto n_seq_tokens = ubatch.n_seq_tokens; - const auto n_seqs = ubatch.n_seqs; - - inpL = build_inp_embd(model.tok_embd); - - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; - - struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( - ctx0, gf, state_copy, state_mask, ubatch, il, worst_case - ); - - struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, 
layer->attn_norm_b, LLM_NORM_RMS, il); - cb(att_norm, "attn_norm", il); - - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - token_shift, - ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), - 1 - ); - - cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - - token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); - ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_inp); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - - cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // ref: https://github.com/facebookresearch/chameleon - // based on the original build_llama() function, changes: - // * qk-norm - // * swin-norm - // * removed bias - // * removed MoE - struct ggml_cgraph * build_chameleon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - if (hparams.swin_norm) { - cur = inpL; - } else { - cur = build_norm(inpL, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - } - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - if (model.layers[il].attn_q_norm) { - Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur) * n_embd_head, - ggml_element_size(Qcur) * n_embd_head * n_head, - 0); - cb(Qcur, "Qcur", il); - - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, il); - cb(Qcur, "Qcur", il); - } - - if (model.layers[il].attn_k_norm) { - Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, - ggml_element_size(Kcur) * n_embd_head, - ggml_element_size(Kcur) * n_embd_head * n_head_kv, 
- 0); - cb(Kcur, "Kcur", il); - - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, il); - cb(Kcur, "Kcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = build_attn(gf, - model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - - if (hparams.swin_norm) { - cur = build_norm(cur, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, il); - } - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - if (!hparams.swin_norm) { - cur = build_norm(ffn_inp, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - } - - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - - if (hparams.swin_norm) { - cur = build_norm(cur, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - } - - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); - - cur = lgf.build_cvec(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = build_norm(cur, - model.output_norm, NULL, - LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - cb(cur, "result_output_with_img_logits", -1); - - // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. - // Needs to be removed once image outputs are supported. 
- int img_token_end_idx = 8196; - int img_token_start_idx = 4; - int num_img_tokens = img_token_end_idx - img_token_start_idx; - // creates 1d tensor of size num_img_tokens and values -FLT_MAX, - // which ensures that text token values are always at least larger than image token values - struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); - img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); - cb(img_logits, "img_logits", -1); - cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - struct ggml_cgraph * build_wavtokenizer_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); - - cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.conv1d_b); - - // posnet - for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { - const auto & layer = model.layers[il].posnet; - - inpL = cur; - - switch (il) { - case 0: - case 1: - case 3: - case 4: - { - cur = build_norm(cur, - layer.norm1, - layer.norm1_b, - LLM_NORM_GROUP, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.conv1_b); - - cur = build_norm(cur, - layer.norm2, - layer.norm2_b, - LLM_NORM_GROUP, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.conv2_b); - - cur = ggml_add(ctx0, cur, inpL); - } break; - case 2: - { - cur = build_norm(cur, - layer.attn_norm, - layer.attn_norm_b, - LLM_NORM_GROUP, 0); - - struct ggml_tensor * q; - struct ggml_tensor * k; - struct ggml_tensor * v; - - q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); - k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); - v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); - - q = ggml_add(ctx0, q, layer.attn_q_b); - k = ggml_add(ctx0, k, layer.attn_k_b); - v = ggml_add(ctx0, v, layer.attn_v_b); - - q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); - k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - - kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); - - cur = ggml_mul_mat(ctx0, kq, v); - - cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.attn_o_b); - - cur = ggml_add(ctx0, cur, inpL); - } break; - case 5: - { - cur = build_norm(cur, - layer.norm, - layer.norm_b, - LLM_NORM_GROUP, 0); - } break; - default: GGML_ABORT("unknown posnet layer"); - }; - } - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - cur = build_norm(cur, - model.tok_norm, - model.tok_norm_b, - LLM_NORM, -1); - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - inpL = cur; - - // convnext - for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { - const auto & layer = model.layers[il].convnext; - - cur = inpL; - - cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); - cur = ggml_add(ctx0, cur, layer.dw_b); - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - cur = build_norm(cur, - layer.norm, - layer.norm_b, - LLM_NORM, -1); - - cur = build_ffn(cur, - layer.pw1, layer.pw1_b, NULL, - NULL, NULL, NULL, - layer.pw2, layer.pw2_b, NULL, - NULL, - LLM_FFN_GELU, 
LLM_FFN_SEQ, cb, il); - - cur = ggml_mul(ctx0, cur, layer.gamma); - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - inpL = ggml_add(ctx0, cur, inpL); - } - - cur = inpL; - - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - cur = build_norm(cur, - model.output_norm, - model.output_norm_b, - LLM_NORM, -1); - - // lm_head - cur = build_lora_mm(model.output, cur); - - cur = ggml_add(ctx0, cur, model.output_b); - cb(cur, "result_embd", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } -}; - -static struct ggml_cgraph * llama_build_graph( - llama_context & lctx, - const llama_ubatch & ubatch, - bool worst_case) { - const auto & model = lctx.get_model(); - const auto & cparams = lctx.get_cparams(); - - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) - llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { - if (il >= 0) { - ggml_format_name(cur, "%s-%d", name, il); - } else { - ggml_set_name(cur, name); - } - - if (!cparams.offload_kqv) { - if (strcmp(name, "kqv_merged_cont") == 0) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu); - } - } - - // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends - // FIXME: fix in ggml_backend_sched - const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; - if (ubatch.n_tokens < 32 || full_offload) { - if (il != -1 && strcmp(name, "norm") == 0) { - const auto & dev_layer = model.dev_layer(il); - for (auto & backend : lctx.backends) { - if (ggml_backend_get_device(backend.get()) == dev_layer) { - if (ggml_backend_supports_op(backend.get(), cur)) { - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get()); - } - } - } - } - } - }; - - struct ggml_cgraph * result = NULL; - - struct llm_build_context llm(lctx, lctx.get_model(), lctx.get_cparams(), ubatch, std::move(cb), lctx.init(), worst_case); - - switch (model.arch) { - case LLM_ARCH_LLAMA: - case LLM_ARCH_MINICPM: - case LLM_ARCH_GRANITE: - case LLM_ARCH_GRANITE_MOE: - { - result = llm.build_llama(); - } break; - case LLM_ARCH_DECI: - { - result = llm.build_deci(); - } break; - case LLM_ARCH_BAICHUAN: - { - result = llm.build_baichuan(); - } break; - case LLM_ARCH_FALCON: - { - result = llm.build_falcon(); - } break; - case LLM_ARCH_GROK: - { - result = llm.build_grok(); - } break; - case LLM_ARCH_STARCODER: - { - result = llm.build_starcoder(); - } break; - case LLM_ARCH_REFACT: - { - result = llm.build_refact(); - } break; - case LLM_ARCH_BERT: - case LLM_ARCH_JINA_BERT_V2: - case LLM_ARCH_NOMIC_BERT: - { - result = llm.build_bert(); - } break; - case LLM_ARCH_BLOOM: - { - result = llm.build_bloom(); - } break; - case LLM_ARCH_MPT: - { - result = llm.build_mpt(); - } break; - case LLM_ARCH_STABLELM: - { - result = llm.build_stablelm(); - } break; - case LLM_ARCH_QWEN: - { - result = llm.build_qwen(); - } break; - case LLM_ARCH_QWEN2: - { - result = llm.build_qwen2(); - } break; - case LLM_ARCH_QWEN2VL: - { - result = llm.build_qwen2vl(); - } break; - case LLM_ARCH_QWEN2MOE: - { - result = llm.build_qwen2moe(); - } break; - case LLM_ARCH_PHI2: - { - result = llm.build_phi2(); - } break; - case LLM_ARCH_PHI3: - case LLM_ARCH_PHIMOE: - { - result = llm.build_phi3(); - } break; - case LLM_ARCH_PLAMO: - { - result = llm.build_plamo(); - } break; - case LLM_ARCH_GPT2: - { - result = 
llm.build_gpt2(); - } break; - case LLM_ARCH_CODESHELL: - { - result = llm.build_codeshell(); - } break; - case LLM_ARCH_ORION: - { - result = llm.build_orion(); - } break; - case LLM_ARCH_INTERNLM2: - { - result = llm.build_internlm2(); - } break; - case LLM_ARCH_MINICPM3: - { - result = llm.build_minicpm3(); - } break; - case LLM_ARCH_GEMMA: - { - result = llm.build_gemma(); - } break; - case LLM_ARCH_GEMMA2: - { - result = llm.build_gemma2(); - } break; - case LLM_ARCH_STARCODER2: - { - result = llm.build_starcoder2(); - } break; - case LLM_ARCH_MAMBA: - { - result = llm.build_mamba(); - } break; - case LLM_ARCH_XVERSE: - { - result = llm.build_xverse(); - } break; - case LLM_ARCH_COMMAND_R: - { - result = llm.build_command_r(); - } break; - case LLM_ARCH_COHERE2: - { - result = llm.build_cohere2(); - } break; - case LLM_ARCH_DBRX: - { - result = llm.build_dbrx(); - } break; - case LLM_ARCH_OLMO: - { - result = llm.build_olmo(); - } break; - case LLM_ARCH_OLMO2: - { - result = llm.build_olmo2(); - } break; - case LLM_ARCH_OLMOE: - { - result = llm.build_olmoe(); - } break; - case LLM_ARCH_OPENELM: - { - result = llm.build_openelm(); - } break; - case LLM_ARCH_GPTNEOX: - { - result = llm.build_gptneox(); - } break; - case LLM_ARCH_ARCTIC: - { - result = llm.build_arctic(); - } break; - case LLM_ARCH_DEEPSEEK: - { - result = llm.build_deepseek(); - } break; - case LLM_ARCH_DEEPSEEK2: - { - result = llm.build_deepseek2(); - } break; - case LLM_ARCH_CHATGLM: - { - result = llm.build_chatglm(); - } break; - case LLM_ARCH_BITNET: - { - result = llm.build_bitnet(); - } break; - //case LLM_ARCH_T5: - // { - // if (lctx.is_encoding) { - // result = llm.build_t5_enc(); - // } else { - // result = llm.build_t5_dec(); - // } - // } break; - //case LLM_ARCH_T5ENCODER: - // { - // result = llm.build_t5_enc(); - // } break; - case LLM_ARCH_JAIS: - { - result = llm.build_jais(); - } break; - case LLM_ARCH_NEMOTRON: - { - result = llm.build_nemotron(); - } break; - case LLM_ARCH_EXAONE: - { - result = llm.build_exaone(); - } break; - case LLM_ARCH_RWKV6: - { - result = llm.build_rwkv6(); - } break; - case LLM_ARCH_RWKV6QWEN2: - { - result = llm.build_rwkv6qwen2(); - } break; - case LLM_ARCH_CHAMELEON: - { - result = llm.build_chameleon(); - } break; - case LLM_ARCH_WAVTOKENIZER_DEC: - { - result = llm.build_wavtokenizer_dec(); - } break; - default: - GGML_ABORT("fatal error"); - } - - // add on pooling layer - if (cparams.embeddings) { - result = llm.append_pooling(result); - } - - return result; -} - // // interface implementation // @@ -7740,10 +327,7 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct - ctx = new llama_context_unified(*model, params, - [](llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { - return llama_build_graph(lctx, ubatch, worst_case); - }); + ctx = new llama_context_unified(*model, params); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); return nullptr; From 6ee86e5e0f45e99fe2f0c3b322fe3ab82e632f9b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 16:29:15 +0200 Subject: [PATCH 37/84] graph : restore ubatch in build_cb ggml-ci --- src/llama-context.cpp | 6 ++---- src/llama-context.h | 1 + src/llama-graph.h | 1 + src/llama-model.cpp | 3 ++- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 74d6a67bbe9d2..62f76f48b9d08 100644 --- 
a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -196,6 +196,7 @@ bool llama_context::apply_adapter_cvec( void llama_context::build_cb( ggml_tensor * cur, const char * name, + const llama_ubatch & ubatch, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -213,10 +214,7 @@ void llama_context::build_cb( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; - // TODO: during #11213, the requirement for ubatch.n_tokens < 32 was removed to simplify - // not sure if this is still needed, but it can be brought back if needed - //if (ubatch.n_tokens < 32 || full_offload) { - if (full_offload) { + if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); for (auto & backend : backends) { diff --git a/src/llama-context.h b/src/llama-context.h index 8d7a6ad58dec4..dc85c797100a4 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -85,6 +85,7 @@ struct llama_context : public llama_graph_i { virtual void build_cb( ggml_tensor * cur, const char * name, + const llama_ubatch & ubatch, int il); // TODO: add encode/decode graphs diff --git a/src/llama-graph.h b/src/llama-graph.h index 0084d99ccade6..d111d76e92b93 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -14,6 +14,7 @@ class llama_graph_i { virtual void build_cb( ggml_tensor * cur, const char * name, + const llama_ubatch & ubatch, int il) = 0; // apply control vector for layer il diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bded48be6c25b..ba11f1e1514cc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -248,6 +248,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara return cur_buft; } } + return nullptr; } @@ -3888,7 +3889,7 @@ struct llm_build_context { // TODO: tmp void cb(struct ggml_tensor * cur, const char * name, int il) { - lgf.build_cb(cur, name, il); + lgf.build_cb(cur, name, ubatch, il); } // TODO: tmp From fbe6a07256c36264bfbb0749d2285f397edf38bb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 12 Feb 2025 17:16:44 +0200 Subject: [PATCH 38/84] context : rename to llama_context_kv_self --- src/llama-context.cpp | 140 +++++++++++++++++++++--------------------- src/llama-context.h | 54 ++++++++-------- src/llama-graph.h | 3 + src/llama-model.h | 1 + src/llama.cpp | 2 +- 5 files changed, 102 insertions(+), 98 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 62f76f48b9d08..665a144d70252 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -332,10 +332,10 @@ void llama_context::perf_reset() { } // -// llama_context_unified +// llama_context_kv_self // -llama_context_unified::llama_context_unified( +llama_context_kv_self::llama_context_kv_self( const llama_model & model, const llama_context_params & params) : llama_context(model) { const auto & hparams = model.hparams; @@ -636,29 +636,29 @@ llama_context_unified::llama_context_unified( } } -llama_context_unified::~llama_context_unified() = default; +llama_context_kv_self::~llama_context_kv_self() = default; -uint32_t llama_context_unified::n_seq_max() const { +uint32_t llama_context_kv_self::n_seq_max() const { // TODO: add notion of n_seq_max to llama_kv_cache and use it here return kv_self.size; } -llama_kv_cache * llama_context_unified::get_kv_self() { +llama_kv_cache * 
llama_context_kv_self::get_kv_self() { return &kv_self; } -const llama_kv_cache * llama_context_unified::get_kv_self() const { +const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -float * llama_context_unified::get_logits() { +float * llama_context_kv_self::get_logits() { // reorder logits for backward compatibility reorder_outputs(); return logits; } -float * llama_context_unified::get_logits_ith(int32_t i) { +float * llama_context_kv_self::get_logits_ith(int32_t i) { int32_t j = -1; try { @@ -696,14 +696,14 @@ float * llama_context_unified::get_logits_ith(int32_t i) { } } -float * llama_context_unified::get_embeddings() { +float * llama_context_kv_self::get_embeddings() { // reorder embeddings for backward compatibility reorder_outputs(); return embd; } -float * llama_context_unified::get_embeddings_ith(int32_t i) { +float * llama_context_kv_self::get_embeddings_ith(int32_t i) { int32_t j = -1; try { @@ -741,7 +741,7 @@ float * llama_context_unified::get_embeddings_ith(int32_t i) { } } -float * llama_context_unified::get_embeddings_seq(llama_seq_id seq_id) { +float * llama_context_kv_self::get_embeddings_seq(llama_seq_id seq_id) { auto it = embd_seq.find(seq_id); if (it == embd_seq.end()) { return nullptr; @@ -750,7 +750,7 @@ float * llama_context_unified::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } -ggml_context_ptr llama_context_unified::init() { +ggml_context_ptr llama_context_kv_self::init() { inp_tokens = nullptr; inp_embd = nullptr; inp_pos = nullptr; @@ -771,8 +771,8 @@ ggml_context_ptr llama_context_unified::init() { return llama_context::init(); } -struct llama_context_unified::batch_manager { - batch_manager(llama_context_unified & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { +struct llama_context_kv_self::batch_manager { + batch_manager(llama_context_kv_self & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & model = lctx.model; const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; @@ -982,18 +982,18 @@ struct llama_context_unified::batch_manager { int64_t n_outputs_all = 0; - llama_context_unified & lctx; + llama_context_kv_self & lctx; const llama_batch & batch; llama_kv_slot_restorer kv_slot_restorer; }; -std::unique_ptr llama_context_unified::prepare_batch(const llama_batch & batch) { +std::unique_ptr llama_context_kv_self::prepare_batch(const llama_batch & batch) { return std::make_unique(*this, batch); } -int llama_context_unified::decode(llama_batch & inp_batch) { +int llama_context_kv_self::decode(llama_batch & inp_batch) { is_encoding = false; if (inp_batch.n_tokens == 0) { @@ -1198,7 +1198,7 @@ int llama_context_unified::decode(llama_batch & inp_batch) { return 0; } -int llama_context_unified::encode(llama_batch & inp_batch) { +int llama_context_kv_self::encode(llama_batch & inp_batch) { is_encoding = true; if (inp_batch.n_tokens == 0) { @@ -1375,7 +1375,7 @@ int llama_context_unified::encode(llama_batch & inp_batch) { return 0; } -enum ggml_status llama_context_unified::compute_graph( +enum ggml_status llama_context_kv_self::compute_graph( ggml_cgraph * graph, bool batched) { int n_threads = batched ? 
cparams.n_threads_batch : cparams.n_threads; @@ -1402,23 +1402,23 @@ enum ggml_status llama_context_unified::compute_graph( return status; } -llama_pos llama_context_unified::pos_max() const { +llama_pos llama_context_kv_self::pos_max() const { return kv_self.pos_max(); } -uint32_t llama_context_unified::get_ctx_padding(const llama_cparams & cparams) const { +uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { return kv_self.get_padding(cparams); } -void llama_context_unified::prepare_k_shift() { +void llama_context_kv_self::prepare_k_shift() { } -void llama_context_unified::prepare_defrag() { +void llama_context_kv_self::prepare_defrag() { } // llama input -void llama_context_unified::set_inputs(const llama_ubatch & ubatch) { +void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; // @@ -1837,7 +1837,7 @@ void llama_context_unified::set_inputs(const llama_ubatch & ubatch) { } } -void llama_context_unified::reorder_outputs() { +void llama_context_kv_self::reorder_outputs() { std::vector & out_ids = sbatch.out_ids; if (!out_ids.empty()) { const uint32_t n_vocab = model.vocab.n_tokens(); @@ -1875,7 +1875,7 @@ void llama_context_unified::reorder_outputs() { } } -size_t llama_context_unified::reserve_outputs(size_t n_outputs) { +size_t llama_context_kv_self::reserve_outputs(size_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; @@ -1944,7 +1944,7 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) { return n_outputs_max; } -void llama_context_unified::kv_self_update() { +void llama_context_kv_self::kv_self_update() { auto & kv = kv_self; if (kv.has_shift) { @@ -2009,7 +2009,7 @@ void llama_context_unified::kv_self_update() { } } -void llama_context_unified::build_attn_inp( +void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -2040,7 +2040,7 @@ void llama_context_unified::build_attn_inp( } } -void llama_context_unified::build_attn_kv_store( +void llama_context_kv_self::build_attn_kv_store( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * k_cur, @@ -2084,7 +2084,7 @@ void llama_context_unified::build_attn_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); } -ggml_tensor * llama_context_unified::build_attn_qkv( +ggml_tensor * llama_context_kv_self::build_attn_qkv( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * wo, @@ -2236,7 +2236,7 @@ ggml_tensor * llama_context_unified::build_attn_qkv( return cur; } -ggml_tensor * llama_context_unified::build_soft_max_ext( +ggml_tensor * llama_context_kv_self::build_soft_max_ext( ggml_context * ctx0, ggml_tensor * kq, float kq_scale) { @@ -2245,7 +2245,7 @@ ggml_tensor * llama_context_unified::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -ggml_tensor * llama_context_unified::build_inp_embd( +ggml_tensor * llama_context_kv_self::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) { @@ -2295,7 +2295,7 @@ ggml_tensor * llama_context_unified::build_inp_embd( return inpL; } -ggml_tensor * llama_context_unified::build_inp_pos( +ggml_tensor * llama_context_kv_self::build_inp_pos( ggml_context * ctx0, int32_t n_tokens) { inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); @@ -2304,7 +2304,7 @@ ggml_tensor * llama_context_unified::build_inp_pos( return inp_pos; } -ggml_tensor * 
llama_context_unified::build_inp_out_ids( +ggml_tensor * llama_context_kv_self::build_inp_out_ids( ggml_context * ctx0, int32_t n_tokens, bool worst_case) { @@ -2316,7 +2316,7 @@ ggml_tensor * llama_context_unified::build_inp_out_ids( return inp_out_ids; } -ggml_tensor * llama_context_unified::build_inp_mean( +ggml_tensor * llama_context_kv_self::build_inp_mean( ggml_context * ctx0, int32_t n_tokens) { inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); @@ -2325,7 +2325,7 @@ ggml_tensor * llama_context_unified::build_inp_mean( return inp_mean; } -ggml_tensor * llama_context_unified::build_inp_cls( +ggml_tensor * llama_context_kv_self::build_inp_cls( ggml_context * ctx0, int32_t n_tokens) { inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -2334,7 +2334,7 @@ ggml_tensor * llama_context_unified::build_inp_cls( return inp_cls; } -void llama_context_unified::build_k_shift( +void llama_context_kv_self::build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) { const auto & n_ctx = cparams.n_ctx; @@ -2406,7 +2406,7 @@ void llama_context_unified::build_k_shift( } } -void llama_context_unified::build_defrag( +void llama_context_kv_self::build_defrag( ggml_context * ctx0, ggml_cgraph * graph) { const auto & hparams = model.hparams; @@ -2676,7 +2676,7 @@ void llama_context_unified::build_defrag( #endif } -ggml_tensor * llama_context_unified::build_inp_embd_enc( +ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, bool worst_case) { @@ -2692,7 +2692,7 @@ ggml_tensor * llama_context_unified::build_inp_embd_enc( return inp_embd_enc; } -ggml_tensor * llama_context_unified::build_inp_KQ_mask_cross( +ggml_tensor * llama_context_kv_self::build_inp_KQ_mask_cross( ggml_context * ctx0, int32_t n_tokens, bool worst_case) { @@ -2708,7 +2708,7 @@ ggml_tensor * llama_context_unified::build_inp_KQ_mask_cross( return inp_KQ_mask_cross; } -ggml_tensor * llama_context_unified::build_inp_s_copy( +ggml_tensor * llama_context_kv_self::build_inp_s_copy( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? kv_self.size : kv_self.n; @@ -2719,7 +2719,7 @@ ggml_tensor * llama_context_unified::build_inp_s_copy( return inp_s_copy; } -ggml_tensor * llama_context_unified::build_inp_s_mask( +ggml_tensor * llama_context_kv_self::build_inp_s_mask( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; @@ -2729,7 +2729,7 @@ ggml_tensor * llama_context_unified::build_inp_s_mask( return inp_s_mask; } -ggml_tensor * llama_context_unified::build_copy_mask_state( +ggml_tensor * llama_context_kv_self::build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * s, @@ -2764,7 +2764,7 @@ ggml_tensor * llama_context_unified::build_copy_mask_state( } // TODO: split -ggml_tensor * llama_context_unified::build_mamba_layer( +ggml_tensor * llama_context_kv_self::build_mamba_layer( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -2900,7 +2900,7 @@ ggml_tensor * llama_context_unified::build_mamba_layer( } -ggml_tensor * llama_context_unified::build_rwkv_token_shift_load( +ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * state_copy, @@ -2927,7 +2927,7 @@ ggml_tensor * llama_context_unified::build_rwkv_token_shift_load( } -ggml_tensor * llama_context_unified::build_rwkv_token_shift_store( +ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, @@ -2951,7 +2951,7 @@ ggml_tensor * llama_context_unified::build_rwkv_token_shift_store( } -ggml_tensor * llama_context_unified::build_rwkv6_time_mix( +ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * graph, ggml_tensor * cur, @@ -3130,7 +3130,7 @@ ggml_tensor * llama_context_unified::build_rwkv6_time_mix( // TODO: replace all non-fatal assertions with returned errors or exceptions struct llama_data_write { - llama_data_write(llama_context_unified * ctx) : ctx(ctx) {} + llama_data_write(llama_context_kv_self * ctx) : ctx(ctx) {} virtual ~llama_data_write() = default; virtual void write(const void * src, size_t size) = 0; @@ -3215,11 +3215,11 @@ struct llama_data_write { } } - llama_context_unified * ctx; + llama_context_kv_self * ctx; }; struct llama_data_read { - llama_data_read(llama_context_unified * ctx) : ctx(ctx) {} + llama_data_read(llama_context_kv_self * ctx) : ctx(ctx) {} virtual ~llama_data_read() = default; virtual const uint8_t * read(size_t size) = 0; @@ -3311,11 +3311,11 @@ struct llama_data_read { } } - llama_context_unified * ctx; + llama_context_kv_self * ctx; }; struct llama_data_write_dummy : llama_data_write { - llama_data_write_dummy(llama_context_unified * ctx) : llama_data_write(ctx) {} + llama_data_write_dummy(llama_context_kv_self * ctx) : llama_data_write(ctx) {} void write(const void * /* src */, size_t size) override { size_written += size; @@ -3334,7 +3334,7 @@ struct llama_data_write_dummy : llama_data_write { struct llama_data_write_buffer : llama_data_write { llama_data_write_buffer( - llama_context_unified * ctx, + llama_context_kv_self * ctx, uint8_t * p, size_t len) : llama_data_write(ctx), ptr(p), buf_size(len) {} void write(const void * src, size_t size) override { @@ -3368,7 +3368,7 @@ struct llama_data_write_buffer : llama_data_write { struct llama_data_read_buffer : llama_data_read { llama_data_read_buffer( - llama_context_unified * ctx, + llama_context_kv_self * ctx, const uint8_t * p, size_t len) : llama_data_read(ctx), ptr(p), buf_size(len) {} const uint8_t * read(size_t size) override { @@ -3397,7 +3397,7 @@ struct llama_data_read_buffer : llama_data_read { struct llama_data_write_file : llama_data_write { llama_data_write_file( - llama_context_unified * ctx, + llama_context_kv_self * ctx, llama_file * f) : llama_data_write(ctx), file(f) {} void write(const 
void * src, size_t size) override { @@ -3422,7 +3422,7 @@ struct llama_data_write_file : llama_data_write { struct llama_data_read_file : llama_data_read { llama_data_read_file( - llama_context_unified * ctx, + llama_context_kv_self * ctx, llama_file * f) : llama_data_read(ctx), file(f) {} void read_to(void * dst, size_t size) override { @@ -3445,7 +3445,7 @@ struct llama_data_read_file : llama_data_read { std::vector temp_buffer; }; -size_t llama_context_unified::state_get_size() { +size_t llama_context_kv_self::state_get_size() { llama_data_write_dummy data_ctx(this); try { return state_get_data(data_ctx); @@ -3455,7 +3455,7 @@ size_t llama_context_unified::state_get_size() { } } -size_t llama_context_unified::state_get_data(uint8_t * dst, size_t size) { +size_t llama_context_kv_self::state_get_data(uint8_t * dst, size_t size) { llama_data_write_buffer data_ctx(this, dst, size); try { return state_get_data(data_ctx); @@ -3465,7 +3465,7 @@ size_t llama_context_unified::state_get_data(uint8_t * dst, size_t size) { } } -size_t llama_context_unified::state_set_data(const uint8_t * src, size_t size) { +size_t llama_context_kv_self::state_set_data(const uint8_t * src, size_t size) { llama_data_read_buffer data_ctx(this, src, size); try { return state_set_data(data_ctx); @@ -3475,7 +3475,7 @@ size_t llama_context_unified::state_set_data(const uint8_t * src, size_t size) { } } -size_t llama_context_unified::state_seq_get_size(llama_seq_id seq_id) { +size_t llama_context_kv_self::state_seq_get_size(llama_seq_id seq_id) { llama_data_write_dummy data_ctx(this); try { return state_seq_get_data(data_ctx, seq_id); @@ -3485,7 +3485,7 @@ size_t llama_context_unified::state_seq_get_size(llama_seq_id seq_id) { } } -size_t llama_context_unified::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { +size_t llama_context_kv_self::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { llama_data_write_buffer data_ctx(this, dst, size); try { return state_seq_get_data(data_ctx, seq_id); @@ -3495,7 +3495,7 @@ size_t llama_context_unified::state_seq_get_data(llama_seq_id seq_id, uint8_t * } } -size_t llama_context_unified::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { +size_t llama_context_kv_self::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { llama_data_read_buffer data_ctx(this, src, size); try { return state_seq_set_data(data_ctx, seq_id); @@ -3505,7 +3505,7 @@ size_t llama_context_unified::state_seq_set_data(llama_seq_id seq_id, const uint } } -bool llama_context_unified::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +bool llama_context_kv_self::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(filepath, "rb"); // sanity checks @@ -3548,7 +3548,7 @@ bool llama_context_unified::state_load_file(const char * filepath, llama_token * return true; } -bool llama_context_unified::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { +bool llama_context_kv_self::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { llama_file file(filepath, "wb"); file.write_u32(LLAMA_SESSION_MAGIC); @@ -3565,7 +3565,7 @@ bool llama_context_unified::state_save_file(const char * filepath, const llama_t return true; } -size_t llama_context_unified::state_seq_load_file(llama_seq_id seq_id, const char * filepath, 
llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +size_t llama_context_kv_self::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(filepath, "rb"); // version checks @@ -3608,7 +3608,7 @@ size_t llama_context_unified::state_seq_load_file(llama_seq_id seq_id, const cha return file.tell(); } -size_t llama_context_unified::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { +size_t llama_context_kv_self::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { llama_file file(filepath, "wb"); file.write_u32(LLAMA_STATE_SEQ_MAGIC); @@ -3641,7 +3641,7 @@ size_t llama_context_unified::state_seq_save_file(llama_seq_id seq_id, const cha * llama_state_get_data_internal(ctx, data_ctx); * */ -size_t llama_context_unified::state_get_data(llama_data_write & data_ctx) { +size_t llama_context_kv_self::state_get_data(llama_data_write & data_ctx) { synchronize(); data_ctx.write_model_info(); @@ -3667,7 +3667,7 @@ size_t llama_context_unified::state_get_data(llama_data_write & data_ctx) { return data_ctx.get_size_written(); } -size_t llama_context_unified::state_set_data(llama_data_read & data_ctx) { +size_t llama_context_kv_self::state_set_data(llama_data_read & data_ctx) { synchronize(); data_ctx.read_model_info(); @@ -3693,7 +3693,7 @@ size_t llama_context_unified::state_set_data(llama_data_read & data_ctx) { return data_ctx.get_size_read(); } -size_t llama_context_unified::state_seq_get_data(llama_data_write & data_ctx, llama_seq_id seq_id) { +size_t llama_context_kv_self::state_seq_get_data(llama_data_write & data_ctx, llama_seq_id seq_id) { synchronize(); llama_kv_cache::io io = { @@ -3712,7 +3712,7 @@ size_t llama_context_unified::state_seq_get_data(llama_data_write & data_ctx, ll return data_ctx.get_size_written(); } -size_t llama_context_unified::state_seq_set_data(llama_data_read & data_ctx, llama_seq_id seq_id) { +size_t llama_context_kv_self::state_seq_set_data(llama_data_read & data_ctx, llama_seq_id seq_id) { synchronize(); llama_kv_cache::io io = { diff --git a/src/llama-context.h b/src/llama-context.h index dc85c797100a4..648a41045a070 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -82,6 +82,8 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); + // graph build API (generic) + virtual void build_cb( ggml_tensor * cur, const char * name, @@ -91,6 +93,27 @@ struct llama_context : public llama_graph_i { // TODO: add encode/decode graphs virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case); + // apply control vector for layer il + virtual ggml_tensor * build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il); + + // do mat_mul, while optionally apply lora + virtual ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + // do mat_mul_id, while optionally apply lora + virtual ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + + virtual ggml_tensor * build_rope_factors(int il); + // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -116,29 +139,6 @@ struct llama_context : 
public llama_graph_i { // virtual int encode(llama_batch & inp_batch) = 0; - // graph build API (generic) - - // apply control vector for layer il - virtual ggml_tensor * build_cvec( - ggml_context * ctx0, - ggml_tensor * cur, - int il); - - // do mat_mul, while optionally apply lora - virtual ggml_tensor * build_lora_mm( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur); - - // do mat_mul_id, while optionally apply lora - virtual ggml_tensor * build_lora_mm_id( - ggml_context * ctx0, - ggml_tensor * w, // struct ggml_tensor * as - ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids); - - virtual ggml_tensor * build_rope_factors(int il); - // state save/load virtual size_t state_get_size() = 0; @@ -217,16 +217,16 @@ struct llama_context : public llama_graph_i { mutable int32_t n_eval = 0; // number of eval calls }; -// TODO: make implementation details private -class llama_context_unified : public llama_context { +// transformer with a self-attention KV cache +class llama_context_kv_self : public llama_context { public: struct batch_manager; - llama_context_unified( + llama_context_kv_self( const llama_model & model, const llama_context_params & params); - virtual ~llama_context_unified(); + virtual ~llama_context_kv_self(); virtual uint32_t n_seq_max() const override; diff --git a/src/llama-graph.h b/src/llama-graph.h index d111d76e92b93..5267d53da4c06 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -2,6 +2,9 @@ #include +// note: do not add high-level objects here, such as llama_context, llama_kv_cache, etc. +// not sure about llama_batch/llama_sbatch yet + struct ggml_cgraph; struct ggml_context; struct ggml_tensor; diff --git a/src/llama-model.h b/src/llama-model.h index 5d2a07abc570f..0374b484b10ab 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -368,6 +368,7 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; // TODO: add encode/decode graphs + // TODO: return a struct containing the graph and the output tensors, such as logits, embeddings, etc. 
ggml_cgraph * build_graph( llama_graph_i & lgf, const llama_cparams & cparams, diff --git a/src/llama.cpp b/src/llama.cpp index 83b66035fc585..d20a2a6d50f60 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -327,7 +327,7 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct - ctx = new llama_context_unified(*model, params); + ctx = new llama_context_kv_self(*model, params); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); return nullptr; From 3a504d9a0bd7d952d22cd2d707446de2316ec955 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 12:18:44 +0200 Subject: [PATCH 39/84] llama : introduce llama_io interfaces ggml-ci --- src/CMakeLists.txt | 1 + src/llama-context.cpp | 488 +++++++++++++++-------------------------- src/llama-context.h | 14 +- src/llama-io.cpp | 15 ++ src/llama-io.h | 35 +++ src/llama-kv-cache.cpp | 18 +- src/llama-kv-cache.h | 21 +- 7 files changed, 254 insertions(+), 338 deletions(-) create mode 100644 src/llama-io.cpp create mode 100644 src/llama-io.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f1f5d41d495a1..7f919c90ec5c3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ add_library(llama llama-graph.cpp llama-hparams.cpp llama-impl.cpp + llama-io.cpp llama-kv-cache.cpp llama-mmap.cpp llama-model-loader.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 665a144d70252..d6618f1438869 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2,6 +2,7 @@ #include "llama-impl.h" #include "llama-mmap.h" +#include "llama-io.h" #include #include @@ -3128,214 +3129,29 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( // TODO: this needs a big rework -// TODO: replace all non-fatal assertions with returned errors or exceptions -struct llama_data_write { - llama_data_write(llama_context_kv_self * ctx) : ctx(ctx) {} - virtual ~llama_data_write() = default; - - virtual void write(const void * src, size_t size) = 0; - virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0; - virtual size_t get_size_written() = 0; - - void write_string(const std::string & str) { - uint32_t str_size = str.size(); - - write(&str_size, sizeof(str_size)); - write(str.data(), str_size); - } - - void write_model_info() { - const auto & model = ctx->get_model(); - const std::string arch_str = llm_arch_name(model.arch); - write_string(arch_str); - // TODO: add more model-specific info which should prevent loading the session file if not identical - } - - //void write_rng(const std::mt19937 & rng) { - // std::ostringstream rng_ss; - // rng_ss << rng; - - // const std::string & rng_str = rng_ss.str(); - - // write_string(rng_str); - //} - - void write_output_ids() { - ctx->reorder_outputs(); - - const uint32_t n_outputs = ctx->n_outputs; - - std::vector output_pos; - - const size_t n_batch = ctx->n_batch(); - const auto & output_ids = ctx->output_ids; - - GGML_ASSERT(n_outputs <= ctx->output_size); - - output_pos.resize(n_outputs); - - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch; ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - GGML_ASSERT((uint32_t) pos < n_outputs); - output_pos[pos] = i; - } - } - - write(&n_outputs, sizeof(n_outputs)); - - if (n_outputs) { - write(output_pos.data(), n_outputs * sizeof(int32_t)); - } - } - - void 
write_logits() { - const auto & model = ctx->get_model(); - - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * model.vocab.n_tokens()); - - write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - write(ctx->logits, logits_size * sizeof(float)); - } - } - - void write_embeddings() { - const auto & model = ctx->get_model(); - - const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * model.hparams.n_embd); - - write(&embeddings_size, sizeof(embeddings_size)); - - if (embeddings_size) { - write(ctx->embd, embeddings_size * sizeof(float)); - } - } - - llama_context_kv_self * ctx; -}; - -struct llama_data_read { - llama_data_read(llama_context_kv_self * ctx) : ctx(ctx) {} - virtual ~llama_data_read() = default; - - virtual const uint8_t * read(size_t size) = 0; - virtual void read_to(void * dst, size_t size) = 0; - virtual size_t get_size_read() = 0; - - void read_string(std::string & str) { - uint32_t str_size; - read_to(&str_size, sizeof(str_size)); - - str.assign((const char *) read(str_size), str_size); - } - - // validate model information - void read_model_info() { - const auto & model = ctx->get_model(); - - const std::string cur_arch_str = llm_arch_name(model.arch); - - std::string arch_str; - read_string(arch_str); - if (cur_arch_str != arch_str) { - throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); - } - // TODO: add more info which needs to be identical but which is not verified otherwise - } - - //void read_rng(std::mt19937 & rng) { - // std::string rng_str; - // read_string(rng_str); - - // std::istringstream rng_ss(rng_str); - // rng_ss >> rng; - - // if (rng_ss.fail()) { - // throw std::runtime_error("failed to load RNG state"); - // } - //} - - void read_output_ids() { - std::vector output_pos; - - uint32_t n_outputs; - read_to(&n_outputs, sizeof(n_outputs)); - - if (n_outputs > ctx->reserve_outputs(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } - - if (n_outputs) { - output_pos.resize(n_outputs); - read_to(output_pos.data(), n_outputs * sizeof(int32_t)); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= ctx->n_batch()) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->n_batch())); - } - ctx->output_ids[id] = i; - } - - ctx->n_outputs = n_outputs; - } - } - - void read_logits() { - uint64_t logits_size; - read_to(&logits_size, sizeof(logits_size)); - - if (ctx->logits_size < logits_size) { - throw std::runtime_error("logits buffer too small"); - } - - if (logits_size) { - read_to(ctx->logits, logits_size * sizeof(float)); - } - } - - void read_embeddings() { - uint64_t embeddings_size; - read_to(&embeddings_size, sizeof(embeddings_size)); - - if (ctx->embd_size < embeddings_size) { - throw std::runtime_error("embeddings buffer too small"); - } - - if (embeddings_size) { - read_to(ctx->embd, embeddings_size * sizeof(float)); - } - } - - llama_context_kv_self * ctx; -}; - -struct llama_data_write_dummy : llama_data_write { - llama_data_write_dummy(llama_context_kv_self * ctx) : llama_data_write(ctx) {} +class llama_io_write_dummy : public llama_io_write_i { +public: + llama_io_write_dummy() = default; void write(const void * /* src */, size_t size) override { size_written += size; } - void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t 
size) override { + void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { size_written += size; } - size_t get_size_written() override { + size_t n_bytes() override { return size_written; } size_t size_written = 0; }; -struct llama_data_write_buffer : llama_data_write { - llama_data_write_buffer( - llama_context_kv_self * ctx, - uint8_t * p, size_t len) : llama_data_write(ctx), ptr(p), buf_size(len) {} +class llama_io_write_buffer : public llama_io_write_i { +public: + llama_io_write_buffer( + uint8_t * p, size_t len) : ptr(p), buf_size(len) {} void write(const void * src, size_t size) override { if (size > buf_size) { @@ -3347,7 +3163,7 @@ struct llama_data_write_buffer : llama_data_write { buf_size -= size; } - void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { if (size > buf_size) { throw std::runtime_error("unexpectedly reached end of buffer"); } @@ -3357,7 +3173,7 @@ struct llama_data_write_buffer : llama_data_write { buf_size -= size; } - size_t get_size_written() override { + size_t n_bytes() override { return size_written; } @@ -3366,10 +3182,9 @@ struct llama_data_write_buffer : llama_data_write { size_t size_written = 0; }; -struct llama_data_read_buffer : llama_data_read { - llama_data_read_buffer( - llama_context_kv_self * ctx, - const uint8_t * p, size_t len) : llama_data_read(ctx), ptr(p), buf_size(len) {} +class llama_io_read_buffer : public llama_io_read_i { +public: + llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} const uint8_t * read(size_t size) override { const uint8_t * base_ptr = ptr; @@ -3386,7 +3201,7 @@ struct llama_data_read_buffer : llama_data_read { memcpy(dst, read(size), size); } - size_t get_size_read() override { + size_t n_bytes() override { return size_read; } @@ -3395,23 +3210,22 @@ struct llama_data_read_buffer : llama_data_read { size_t size_read = 0; }; -struct llama_data_write_file : llama_data_write { - llama_data_write_file( - llama_context_kv_self * ctx, - llama_file * f) : llama_data_write(ctx), file(f) {} +class llama_io_write_file : public llama_io_write_i { +public: + llama_io_write_file(llama_file * f) : file(f) {} void write(const void * src, size_t size) override { file->write_raw(src, size); size_written += size; } - void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override { + void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { temp_buffer.resize(size); ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); write(temp_buffer.data(), temp_buffer.size()); } - size_t get_size_written() override { + size_t n_bytes() override { return size_written; } @@ -3420,10 +3234,9 @@ struct llama_data_write_file : llama_data_write { std::vector temp_buffer; }; -struct llama_data_read_file : llama_data_read { - llama_data_read_file( - llama_context_kv_self * ctx, - llama_file * f) : llama_data_read(ctx), file(f) {} +class llama_io_read_file : public llama_io_read_i { +public: + llama_io_read_file(llama_file * f) : file(f) {} void read_to(void * dst, size_t size) override { file->read_raw(dst, size); @@ -3436,7 +3249,7 @@ struct llama_data_read_file : llama_data_read { return temp_buffer.data(); } - size_t get_size_read() override { + size_t n_bytes() override { return size_read; } @@ -3446,9 +3259,9 @@ struct llama_data_read_file : llama_data_read { }; size_t 
llama_context_kv_self::state_get_size() { - llama_data_write_dummy data_ctx(this); + llama_io_write_dummy io; try { - return state_get_data(data_ctx); + return state_get_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); return 0; @@ -3456,9 +3269,9 @@ size_t llama_context_kv_self::state_get_size() { } size_t llama_context_kv_self::state_get_data(uint8_t * dst, size_t size) { - llama_data_write_buffer data_ctx(this, dst, size); + llama_io_write_buffer io(dst, size); try { - return state_get_data(data_ctx); + return state_get_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); return 0; @@ -3466,9 +3279,9 @@ size_t llama_context_kv_self::state_get_data(uint8_t * dst, size_t size) { } size_t llama_context_kv_self::state_set_data(const uint8_t * src, size_t size) { - llama_data_read_buffer data_ctx(this, src, size); + llama_io_read_buffer io(src, size); try { - return state_set_data(data_ctx); + return state_set_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; @@ -3476,9 +3289,9 @@ size_t llama_context_kv_self::state_set_data(const uint8_t * src, size_t size) { } size_t llama_context_kv_self::state_seq_get_size(llama_seq_id seq_id) { - llama_data_write_dummy data_ctx(this); + llama_io_write_dummy io; try { - return state_seq_get_data(data_ctx, seq_id); + return state_seq_get_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); return 0; @@ -3486,9 +3299,9 @@ size_t llama_context_kv_self::state_seq_get_size(llama_seq_id seq_id) { } size_t llama_context_kv_self::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { - llama_data_write_buffer data_ctx(this, dst, size); + llama_io_write_buffer io(dst, size); try { - return state_seq_get_data(data_ctx, seq_id); + return state_seq_get_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); return 0; @@ -3496,9 +3309,9 @@ size_t llama_context_kv_self::state_seq_get_data(llama_seq_id seq_id, uint8_t * } size_t llama_context_kv_self::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { - llama_data_read_buffer data_ctx(this, src, size); + llama_io_read_buffer io(src, size); try { - return state_seq_set_data(data_ctx, seq_id); + return state_seq_set_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; @@ -3536,8 +3349,8 @@ bool llama_context_kv_self::state_load_file(const char * filepath, llama_token * { const size_t n_state_size_cur = file.size() - file.tell(); - llama_data_read_file data_ctx(this, &file); - const size_t n_read = state_set_data(data_ctx); + llama_io_read_file io( &file); + const size_t n_read = state_set_data(io); if (n_read != n_state_size_cur) { LLAMA_LOG_ERROR("%s: did not read all of the session file data! 
size %zu, got %zu\n", __func__, n_state_size_cur, n_read); @@ -3559,8 +3372,8 @@ bool llama_context_kv_self::state_save_file(const char * filepath, const llama_t file.write_raw(tokens, sizeof(llama_token) * n_token_count); // save the context state using stream saving - llama_data_write_file data_ctx(this, &file); - state_get_data(data_ctx); + llama_io_write_file io(&file); + state_get_data(io); return true; } @@ -3595,8 +3408,8 @@ size_t llama_context_kv_self::state_seq_load_file(llama_seq_id seq_id, const cha // restore the context state { const size_t state_size = file.size() - file.tell(); - llama_data_read_file data_ctx(this, &file); - const size_t nread = state_seq_set_data(data_ctx, seq_id); + llama_io_read_file io(&file); + const size_t nread = state_seq_set_data(io, seq_id); if (!nread) { LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); return 0; @@ -3619,116 +3432,171 @@ size_t llama_context_kv_self::state_seq_save_file(llama_seq_id seq_id, const cha file.write_raw(tokens, sizeof(llama_token) * n_token_count); // save the context state using stream saving - llama_data_write_file data_ctx(this, &file); - state_seq_get_data(data_ctx, seq_id); + llama_io_write_file io(&file); + state_seq_get_data(io, seq_id); const size_t res = file.tell(); - GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written()); + GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes()); return res; } -/** copy state data into either a buffer or file depending on the passed in context - * - * file context: - * llama_file file("/path", "wb"); - * llama_data_write_file data_ctx(&file); - * llama_state_get_data_internal(ctx, data_ctx); - * - * buffer context: - * std::vector buf(max_size, 0); - * llama_data_write_buffer data_ctx(buf.data(), max_size); - * llama_state_get_data_internal(ctx, data_ctx); - * -*/ -size_t llama_context_kv_self::state_get_data(llama_data_write & data_ctx) { +size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { synchronize(); - data_ctx.write_model_info(); - - // copy outputs - data_ctx.write_output_ids(); - data_ctx.write_logits(); - data_ctx.write_embeddings(); - - llama_kv_cache::io io = { - /* .write = */ [&](const void * src, size_t size) { - data_ctx.write(src, size); - }, - /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { - data_ctx.write_tensor_data(tensor, offset, size); - }, - /* .read = */ nullptr, - /* .read_to = */ nullptr, - }; + // write model info + { + const std::string arch_str = llm_arch_name(model.arch); + io.write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical + } + + // write output ids + { + reorder_outputs(); + + const uint32_t n_outputs = this->n_outputs; + const auto & output_ids = this->output_ids; + + std::vector w_output_pos; + + GGML_ASSERT(n_outputs <= output_size); + + w_output_pos.resize(n_outputs); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch(); ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + GGML_ASSERT((uint32_t) pos < n_outputs); + w_output_pos[pos] = i; + } + } + + io.write(&n_outputs, sizeof(n_outputs)); + + if (n_outputs) { + io.write(w_output_pos.data(), n_outputs * sizeof(int32_t)); + } + } + + // write logits + { + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) 
n_outputs * model.vocab.n_tokens()); + + io.write(&logits_size, sizeof(logits_size)); + + if (logits_size) { + io.write(logits, logits_size * sizeof(float)); + } + } + + // write mbeddings + { + const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); + + io.write(&embd_size, sizeof(embd_size)); + + if (embd_size) { + io.write(embd, embd_size * sizeof(float)); + } + } kv_self.state_write(io, model.hparams); - return data_ctx.get_size_written(); + return io.n_bytes(); } -size_t llama_context_kv_self::state_set_data(llama_data_read & data_ctx) { +size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { synchronize(); - data_ctx.read_model_info(); - - // set outputs - data_ctx.read_output_ids(); - data_ctx.read_logits(); - data_ctx.read_embeddings(); - - llama_kv_cache::io io = { - /* .write = */ nullptr, - /* .write_tensor_data = */ nullptr, - /* .read = */ [&](size_t size) { - return data_ctx.read(size); - }, - /* .read_to = */ [&](void * dst, size_t size) { - data_ctx.read_to(dst, size); - }, - }; + // read model info + { + const std::string cur_arch_str = llm_arch_name(model.arch); + + std::string arch_str; + io.read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); + } + // TODO: add more info which needs to be identical but which is not verified otherwise + } + + // read output ids + { + std::vector output_pos; + + uint32_t n_outputs; + io.read_to(&n_outputs, sizeof(n_outputs)); + + if (n_outputs > reserve_outputs(n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } + + if (n_outputs) { + output_pos.resize(n_outputs); + io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); + } + this->output_ids[id] = i; + } + + this->n_outputs = n_outputs; + } + } + + // read logits + { + uint64_t logits_size; + io.read_to(&logits_size, sizeof(logits_size)); + + if (this->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } + + if (logits_size) { + io.read_to(this->logits, logits_size * sizeof(float)); + } + } + + // read embeddings + { + uint64_t embd_size; + io.read_to(&embd_size, sizeof(embd_size)); + + if (this->embd_size < embd_size) { + throw std::runtime_error("embeddings buffer too small"); + } + + if (embd_size) { + io.read_to(this->embd, embd_size * sizeof(float)); + } + } kv_self.state_read(io, model.hparams); - return data_ctx.get_size_read(); + return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_get_data(llama_data_write & data_ctx, llama_seq_id seq_id) { +size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { synchronize(); - llama_kv_cache::io io = { - /* .write = */ [&](const void * src, size_t size) { - data_ctx.write(src, size); - }, - /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { - data_ctx.write_tensor_data(tensor, offset, size); - }, - /* .read = */ nullptr, - /* .read_to = */ nullptr, - }; - kv_self.state_write(io, model.hparams, seq_id); - return data_ctx.get_size_written(); + return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_set_data(llama_data_read & data_ctx, llama_seq_id seq_id) { 
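Taken together, the writer/reader classes introduced here let the same serialization routine run twice: once against a dummy writer to measure the state size, and once against a buffer writer to perform the actual copy. The following is a small, self-contained usage sketch of that dummy-then-buffer pattern; write_state() and its payload are hypothetical stand-ins for the real state serialization, and it assumes the concrete llama_io_write_dummy / llama_io_write_buffer classes from llama-context.cpp are visible in the same translation unit:

#include <cstdint>
#include <string>
#include <vector>

#include "llama-io.h" // llama_io_write_i, introduced by this patch

// hypothetical payload writer, standing in for state_get_data(llama_io_write_i &)
static size_t write_state(llama_io_write_i & io) {
    const uint32_t version = 1;        // made-up field, for illustration only
    io.write(&version, sizeof(version));
    io.write_string("example-state");  // helper implemented in llama-io.cpp
    return io.n_bytes();
}

static std::vector<uint8_t> serialize_state() {
    // pass 1: count bytes without copying anything
    llama_io_write_dummy counter;
    const size_t n_bytes = write_state(counter);

    // pass 2: stream the same data into a caller-owned buffer of exactly that size
    std::vector<uint8_t> buf(n_bytes);
    llama_io_write_buffer writer(buf.data(), buf.size());
    write_state(writer);

    return buf;
}

The same two-pass idea is what state_get_size() and state_get_data(uint8_t *, size_t) implement above, just routed through the virtual llama_io_write_i interface so that file- and buffer-backed targets share one code path.
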
+size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { synchronize(); - llama_kv_cache::io io = { - /* .write = */ nullptr, - /* .write_tensor_data = */ nullptr, - /* .read = */ [&](size_t size) { - return data_ctx.read(size); - }, - /* .read_to = */ [&](void * dst, size_t size) { - data_ctx.read_to(dst, size); - }, - }; - kv_self.state_read(io, model.hparams, seq_id); - return data_ctx.get_size_read(); + return io.n_bytes(); } // diff --git a/src/llama-context.h b/src/llama-context.h index 648a41045a070..204793d75a5b1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -15,6 +15,9 @@ #include #include +class llama_io_read_i; +class llama_io_write_i; + using llama_loras = std::unordered_map; struct llama_context : public llama_graph_i { @@ -178,9 +181,10 @@ struct llama_context : public llama_graph_i { virtual llama_perf_context_data perf_get_data() const; virtual void perf_reset(); +protected: + // members -protected: const llama_model & model; llama_cparams cparams; @@ -502,11 +506,11 @@ class llama_context_kv_self : public llama_context { size_t n_token_count) override; private: - size_t state_get_data(struct llama_data_write & data_ctx); - size_t state_set_data(struct llama_data_read & data_ctx); + size_t state_get_data(llama_io_write_i & io); + size_t state_set_data(llama_io_read_i & io); - size_t state_seq_get_data(struct llama_data_write & data_ctx, llama_seq_id seq_id); - size_t state_seq_set_data(struct llama_data_read & data_ctx, llama_seq_id seq_id); + size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); + size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); }; // For internal test use diff --git a/src/llama-io.cpp b/src/llama-io.cpp new file mode 100644 index 0000000000000..7ad70d163343d --- /dev/null +++ b/src/llama-io.cpp @@ -0,0 +1,15 @@ +#include "llama-io.h" + +void llama_io_write_i::write_string(const std::string & str) { + uint32_t str_size = str.size(); + + write(&str_size, sizeof(str_size)); + write(str.data(), str_size); +} + +void llama_io_read_i::read_string(std::string & str) { + uint32_t str_size; + read_to(&str_size, sizeof(str_size)); + + str.assign((const char *) read(str_size), str_size); +} diff --git a/src/llama-io.h b/src/llama-io.h new file mode 100644 index 0000000000000..ce9216b83b192 --- /dev/null +++ b/src/llama-io.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +struct ggml_tensor; + +class llama_io_write_i { +public: + llama_io_write_i() = default; + virtual ~llama_io_write_i() = default; + + virtual void write(const void * src, size_t size) = 0; + virtual void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) = 0; + + // bytes written so far + virtual size_t n_bytes() = 0; + + void write_string(const std::string & str); +}; + +class llama_io_read_i { +public: + llama_io_read_i() = default; + virtual ~llama_io_read_i() = default; + + virtual const uint8_t * read(size_t size) = 0; + virtual void read_to(void * dst, size_t size) = 0; + + // bytes read so far + virtual size_t n_bytes() = 0; + + void read_string(std::string & str); +}; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index b79c2ff934a6e..c93410f0a412c 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -698,7 +698,7 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) const { +void 
llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -736,7 +736,7 @@ void llama_kv_cache::state_write(const io & io, const llama_hparams & hparams, l state_write_data(io, cell_ranges, hparams); } -void llama_kv_cache::state_read(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) { +void llama_kv_cache::state_read(llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id) { uint32_t cell_count; io.read_to(&cell_count, sizeof(cell_count)); @@ -754,7 +754,7 @@ void llama_kv_cache::state_read(const io & io, const llama_hparams & hparams, ll } } -void llama_kv_cache::state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { +void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { for (const auto & range : cell_ranges) { for (uint32_t i = range.first; i < range.second; ++i) { const auto & cell = cells[i]; @@ -773,7 +773,7 @@ void llama_kv_cache::state_write_meta(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { +void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { const uint32_t v_trans = this->v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; @@ -799,7 +799,7 @@ void llama_kv_cache::state_write_data(const io & io, const std::vector write; - std::function write_tensor_data; - - std::function read; - std::function read_to; - }; - - void state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; - void state_read (const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + void state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; + void state_read (llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); private: ggml_type type_k = GGML_TYPE_F16; @@ -132,11 +125,11 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; - void state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; - void state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; - bool state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count); + bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count); }; // From f7c7757babe54db018f8f16953148cb79a287d17 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 12:37:28 +0200 Subject: [PATCH 40/84] context : abstract state read/write ggml-ci --- src/llama-context.cpp | 2882 +++++++++++++++++++++-------------------- src/llama-context.h | 72 +- 2 files changed, 1482 insertions(+), 1472 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d6618f1438869..bde6659531024 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -326,1027 +326,1169 @@ 
ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } -void llama_context::perf_reset() { - t_start_us = ggml_time_us(); - t_eval_us = n_eval = 0; - t_p_eval_us = n_p_eval = 0; -} - // -// llama_context_kv_self +// state // -llama_context_kv_self::llama_context_kv_self( - const llama_model & model, - const llama_context_params & params) : llama_context(model) { - const auto & hparams = model.hparams; - - cparams.n_seq_max = std::max(1u, params.n_seq_max); - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; - cparams.embeddings = params.embeddings; - cparams.offload_kqv = params.offload_kqv; - cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); +class llama_io_write_dummy : public llama_io_write_i { +public: + llama_io_write_dummy() = default; - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + void write(const void * /* src */, size_t size) override { + size_written += size; + } - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; + void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { + size_written += size; } - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + size_t n_bytes() override { + return size_written; + } - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_ctx_orig_yarn != 0 ? 
hparams.n_ctx_orig_yarn : - hparams.n_ctx_train; + size_t size_written = 0; +}; - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; +class llama_io_write_buffer : public llama_io_write_i { +public: + llama_io_write_buffer( + uint8_t * p, size_t len) : ptr(p), buf_size(len) {} - auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { - rope_scaling_type = hparams.rope_scaling_type_train; + void write(const void * src, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + memcpy(ptr, src, size); + ptr += size; + size_written += size; + buf_size -= size; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { - cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + ggml_backend_tensor_get(tensor, ptr, offset, size); + ptr += size; + size_written += size; + buf_size -= size; } - if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + size_t n_bytes() override { + return size_written; } - cparams.yarn_attn_factor *= hparams.rope_attn_factor; + uint8_t * ptr; + size_t buf_size = 0; + size_t size_written = 0; +}; - if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; - } else { - cparams.pooling_type = hparams.pooling_type; +class llama_io_read_buffer : public llama_io_read_i { +public: + llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + + const uint8_t * read(size_t size) override { + const uint8_t * base_ptr = ptr; + if (size > buf_size) { + throw std::runtime_error("unexpectedly reached end of buffer"); } + ptr += size; + size_read += size; + buf_size -= size; + return base_ptr; } - if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { - cparams.causal_attn = hparams.causal_attn; - } else { - cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + void read_to(void * dst, size_t size) override { + memcpy(dst, read(size), size); } - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + size_t n_bytes() override { + return size_read; + } - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + const uint8_t * ptr; + size_t buf_size = 0; + size_t size_read = 0; +}; - if (n_ctx_per_seq < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); +class llama_io_write_file : public llama_io_write_i { +public: + llama_io_write_file(llama_file * f) : file(f) {} + + void write(const void * src, 
size_t size) override { + file->write_raw(src, size); + size_written += size; } - if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); + void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { + temp_buffer.resize(size); + ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); + write(temp_buffer.data(), temp_buffer.size()); } - logits_all = params.logits_all; + size_t n_bytes() override { + return size_written; + } - // build worst-case graph for encoder if a model contains encoder - is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() + llama_file * file; + size_t size_written = 0; + std::vector temp_buffer; +}; - uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; +class llama_io_read_file : public llama_io_read_i { +public: + llama_io_read_file(llama_file * f) : file(f) {} - // Mamba only needs a constant number of KV cache cells per sequence - if (llama_model_is_recurrent(&model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + void read_to(void * dst, size_t size) override { + file->read_raw(dst, size); + size_read += size; } - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); - GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + const uint8_t * read(size_t size) override { + temp_buffer.resize(size); + read_to(temp_buffer.data(), size); + return temp_buffer.data(); + } - if (!hparams.vocab_only) { - // GPU backends - for (auto * dev : model.devices) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - throw std::runtime_error("failed to initialize backend"); - } - backends.emplace_back(backend); - } + size_t n_bytes() override { + return size_read; + } - // add ACCEL backends (such as BLAS) - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - throw std::runtime_error("failed to initialize backend"); - } - backends.emplace_back(backend); - } - } + llama_file * file; + size_t size_read = 0; + std::vector temp_buffer; +}; - // add CPU backend - backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - throw std::runtime_error("failed to initialize CPU backend"); - } - backends.emplace_back(backend_cpu); +size_t llama_context::state_get_size() { + llama_io_write_dummy io; + try { + return state_get_data(io); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } +} - // create a list of the set_n_threads functions in the 
backends - for (auto & backend : backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); - } - } - } +size_t llama_context::state_get_data(uint8_t * dst, size_t size) { + llama_io_write_buffer io(dst, size); + try { + return state_get_data(io); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } +} - llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); +size_t llama_context::state_set_data(const uint8_t * src, size_t size) { + llama_io_read_buffer io(src, size); + try { + return state_set_data(io); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } +} - if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { - LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); - throw std::runtime_error("failed to initialize self-attention cache"); - } +size_t llama_context::state_seq_get_size(llama_seq_id seq_id) { + llama_io_write_dummy io; + try { + return state_seq_get_data(io, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); + return 0; + } +} - { - const size_t memory_size_k = kv_self.size_k_bytes(); - const size_t memory_size_v = kv_self.size_v_bytes(); +size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { + llama_io_write_buffer io(dst, size); + try { + return state_seq_get_data(io, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); + return 0; + } +} - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); +size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { + llama_io_read_buffer io(src, size); + try { + return state_seq_set_data(io, seq_id); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); + return 0; + } +} + +bool llama_context::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); + + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; } + } - // graph outputs buffer - { - // resized during inference when a batch uses more outputs - if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { - LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); - throw std::runtime_error("failed to reserve initial output buffer"); - } + // load the prompt + { + const uint32_t 
n_token_count = file.read_u32(); - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name (buf_output.get()), - ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; } - // scheduler and compute buffers - { - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model.devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; - } - } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } - const size_t max_nodes = model.max_nodes(); + // restore the context state + { + const size_t n_state_size_cur = file.size() - file.tell(); - // buffer used to store the computation graph and the tensor meta data - // TODO: move to base class - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + llama_io_read_file io( &file); + const size_t n_read = state_set_data(io); - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model.n_devices() > 1 && - model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; + if (n_read != n_state_size_cur) { + LLAMA_LOG_ERROR("%s: did not read all of the session file data! 
size %zu, got %zu\n", __func__, n_state_size_cur, n_read); + return false; + } + } - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { - // ignore CPU backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; - } - } - } + return true; +} - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); +bool llama_context::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); - } + file.write_u32(LLAMA_SESSION_MAGIC); + file.write_u32(LLAMA_SESSION_VERSION); - // initialize scheduler with the worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); + // save the context state using stream saving + llama_io_write_file io(&file); + state_get_data(io); - // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(sched.get(), gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); + return true; +} - // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); - ggml_backend_sched_reserve(sched.get(), gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); +size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = build_graph(ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } + // version checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - 
ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } + if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); + return 0; + } + } - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); - } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); - } + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return 0; } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; } -} -llama_context_kv_self::~llama_context_kv_self() = default; + // restore the context state + { + const size_t state_size = file.size() - file.tell(); + llama_io_read_file io(&file); + const size_t nread = state_seq_set_data(io, seq_id); + if (!nread) { + LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); + return 0; + } + GGML_ASSERT(nread <= state_size); + GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); + } -uint32_t llama_context_kv_self::n_seq_max() const { - // TODO: add notion of n_seq_max to llama_kv_cache and use it here - return kv_self.size; + return file.tell(); } -llama_kv_cache * llama_context_kv_self::get_kv_self() { - return &kv_self; -} +size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); -const llama_kv_cache * llama_context_kv_self::get_kv_self() const { - return &kv_self; + file.write_u32(LLAMA_STATE_SEQ_MAGIC); + file.write_u32(LLAMA_STATE_SEQ_VERSION); + + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state using stream saving + llama_io_write_file io(&file); + state_seq_get_data(io, seq_id); + + const size_t res = file.tell(); + GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes()); + + return res; } -float * llama_context_kv_self::get_logits() { - // reorder logits for backward compatibility - reorder_outputs(); +size_t llama_context::state_get_data(llama_io_write_i & io) { + // write model info + { + const std::string arch_str = llm_arch_name(model.arch); + io.write_string(arch_str); + // TODO: add more model-specific info which should prevent loading the session file if not identical + } - return logits; + return io.n_bytes(); } -float * llama_context_kv_self::get_logits_ith(int32_t i) { - int32_t j = -1; +size_t llama_context::state_set_data(llama_io_read_i & io) { + // read model info + { + const std::string cur_arch_str = llm_arch_name(model.arch); - try { - if (logits == nullptr) { - throw std::runtime_error("no logits"); - } - - if (i < 0) { - j = n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); - } - } else if ((size_t) i >= output_ids.size()) { - throw 
std::runtime_error(format("out of range [0, %zu)", output_ids.size())); - } else { - j = output_ids[i]; - } - - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + std::string arch_str; + io.read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); } - - return logits + j*model.vocab.n_tokens(); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif + // TODO: add more info which needs to be identical but which is not verified otherwise } -} - -float * llama_context_kv_self::get_embeddings() { - // reorder embeddings for backward compatibility - reorder_outputs(); - return embd; + return io.n_bytes(); } -float * llama_context_kv_self::get_embeddings_ith(int32_t i) { - int32_t j = -1; - - try { - if (embd == nullptr) { - throw std::runtime_error("no embeddings"); - } - - if (i < 0) { - j = n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); - } - } else if ((size_t) i >= output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); - } else { - j = output_ids[i]; - } - - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } +size_t llama_context::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { + GGML_UNUSED(seq_id); - return embd + j*model.hparams.n_embd; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif - } + return io.n_bytes(); } -float * llama_context_kv_self::get_embeddings_seq(llama_seq_id seq_id) { - auto it = embd_seq.find(seq_id); - if (it == embd_seq.end()) { - return nullptr; - } +size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { + GGML_UNUSED(seq_id); - return it->second.data(); + return io.n_bytes(); } -ggml_context_ptr llama_context_kv_self::init() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; - inp_KQ_mask = nullptr; - inp_KQ_mask_cnv = nullptr; - inp_KQ_mask_swa = nullptr; - inp_KQ_mask_swa_cnv = nullptr; - inp_KQ_mask_cross = nullptr; - inp_K_shift = nullptr; - inp_s_copy = nullptr; - inp_s_mask = nullptr; - - return llama_context::init(); +void llama_context::perf_reset() { + t_start_us = ggml_time_us(); + t_eval_us = n_eval = 0; + t_p_eval_us = n_p_eval = 0; } -struct llama_context_kv_self::batch_manager { - batch_manager(llama_context_kv_self & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { - const auto & model = lctx.model; - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - - const auto & kv_self = lctx.kv_self; - - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; +// +// 
llama_context_kv_self +// - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT +llama_context_kv_self::llama_context_kv_self( + const llama_model & model, + const llama_context_params & params) : llama_context(model) { + const auto & hparams = model.hparams; - if (batch.token) { - for (int64_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - throw std::runtime_error("invalid token"); - } - } - } + cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; + cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; - GGML_ASSERT(n_tokens_all <= cparams.n_batch); + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); - if (lctx.t_compute_start_us == 0) { - lctx.t_compute_start_us = ggml_time_us(); - } - lctx.n_queued_tokens += n_tokens_all; + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } - lctx.embd_seq.clear(); + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - // count outputs - if (batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens_all; ++i) { - n_outputs_all += batch.logits[i] != 0; - } - } else if (lctx.logits_all || embd_pooled) { - n_outputs_all = n_tokens_all; - } else { - // keep last output only - n_outputs_all = 1; - } + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? 
hparams.n_ctx_orig_yarn : + hparams.n_ctx_train; - const bool logits_all = n_outputs_all == n_tokens_all; + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ logits_all); + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; } - ~batch_manager() { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } - bool is_done() const { - return lctx.sbatch.n_tokens == 0; + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } - llama_ubatch next() { - llama_ubatch ubatch = llama_ubatch(); - - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; - - const auto & n_ubatch = cparams.n_ubatch; - - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + cparams.yarn_attn_factor *= hparams.rope_attn_factor; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); - } + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); + cparams.pooling_type = hparams.pooling_type; } - - return ubatch; } - bool prepare(const llama_ubatch & ubatch) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & batch = lctx.sbatch.batch; + if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { + cparams.causal_attn = hparams.causal_attn; + } else { + cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + } - const auto n_tokens_all = batch->n_tokens; + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - auto & kv_self = lctx.kv_self; + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - // count the outputs in this u_batch - { - int32_t n_outputs_new = 0; + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } - if (n_outputs_all == n_tokens_all) { - n_outputs_new = ubatch.n_tokens; - } else { - GGML_ASSERT(ubatch.output); - for (uint32_t i = 0; i < ubatch.n_tokens; i++) { - n_outputs_new += (int32_t) (ubatch.output[i] != 0); - } - } + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible 
training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } - // needs to happen before the graph is built - lctx.n_outputs = n_outputs_new; - } + logits_all = params.logits_all; - // non-causal masks do not use the KV cache - if (hparams.causal_attn) { - lctx.kv_self_update(); + // build worst-case graph for encoder if a model contains encoder + is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { - kv_self.head = 0; + uint32_t kv_size = cparams.n_ctx; + ggml_type type_k = params.type_k; + ggml_type type_v = params.type_v; + + // Mamba only needs a constant number of KV cache cells per sequence + if (llama_model_is_recurrent(&model)) { + // Mamba needs at least as many KV cells as there are sequences kept at any time + kv_size = std::max((uint32_t) 1, params.n_seq_max); + // it's probably best to keep as much precision as possible for the states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + } + + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + + if (!hparams.vocab_only) { + // GPU backends + for (auto * dev : model.devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); } + backends.emplace_back(backend); + } - const auto slot_info = kv_self.find_slot(ubatch); - if (!slot_info) { - return false; + // add ACCEL backends (such as BLAS) + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); } + } - kv_slot_restorer.save(slot_info); + // add CPU backend + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + throw std::runtime_error("failed to initialize CPU backend"); + } + backends.emplace_back(backend_cpu); - if (!kv_self.recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); + // create a list of the set_n_threads functions in the backends + for (auto & backend : backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); + } } } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); - // reserve a worst case graph if needed - if (lctx.need_reserve) { - LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + throw std::runtime_error("failed to initialize self-attention cache"); + } - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; + { + const size_t memory_size_k = kv_self.size_k_bytes(); + const size_t memory_size_v = kv_self.size_v_bytes(); - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + // graph outputs buffer + { + // resized during inference when a batch uses more outputs + if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); + throw std::runtime_error("failed to reserve initial output buffer"); + } - ggml_cgraph * gf = lctx.build_graph(ubatch, true); + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name (buf_output.get()), + ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + } - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + // scheduler and compute buffers + { + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : backends) { + auto * buft = ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); } - lctx.need_reserve = false; - } + const size_t max_nodes = model.max_nodes(); - return true; - } + // buffer used to store the computation graph and the tensor meta data + // TODO: move to base 
class + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - void restore() { - kv_slot_restorer.restore(lctx.kv_self); - } + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + params.offload_kqv; - void update(const llama_ubatch & ubatch) { - auto & kv_self = lctx.kv_self; + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } - // update the kv ring buffer - { - kv_self.head += ubatch.n_tokens; + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); - // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); } - } - } - void finalize() { - const auto & cparams = lctx.cparams; + // initialize scheduler with the worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - auto & kv_self = lctx.kv_self; + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); - // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { - // - do not defrag small contexts (i.e. < 2048 tokens) - // - count the padding towards the number of used tokens - const float fragmentation = kv_self.n >= 2048 ? 
std::max(0.0f, 1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; + // reserve pp graph first so that buffers are only allocated once + ggml_backend_sched_reserve(sched.get(), gf_pp); + int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_pp = ggml_graph_n_nodes(gf_pp); - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); + // reserve with tg graph to get the number of splits and nodes + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); + ggml_backend_sched_reserve(sched.get(), gf_tg); + int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_tg = ggml_graph_n_nodes(gf_tg); - kv_self.defrag(); + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + gf_pp = build_graph(ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); } } } +} - int64_t n_outputs_all = 0; +llama_context_kv_self::~llama_context_kv_self() = default; - llama_context_kv_self & lctx; +uint32_t llama_context_kv_self::n_seq_max() const { + // TODO: add notion of n_seq_max to llama_kv_cache and use it here + return kv_self.size; +} - const llama_batch & batch; - - llama_kv_slot_restorer kv_slot_restorer; -}; +llama_kv_cache * llama_context_kv_self::get_kv_self() { + return &kv_self; +} -std::unique_ptr llama_context_kv_self::prepare_batch(const llama_batch & batch) { - return std::make_unique(*this, batch); +const llama_kv_cache * llama_context_kv_self::get_kv_self() const { + return &kv_self; } -int llama_context_kv_self::decode(llama_batch & inp_batch) { - is_encoding = false; +float * llama_context_kv_self::get_logits() { + // reorder logits for backward compatibility + reorder_outputs(); - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } + return logits; +} - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); +float * llama_context_kv_self::get_logits_ith(int32_t i) { + int32_t j = -1; - const llama_batch & batch = batch_allocr.batch; + try { + if (logits == nullptr) { + throw std::runtime_error("no logits"); + } - const auto & vocab = model.vocab; - const auto & hparams = model.hparams; + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } - const int32_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } - // TODO: try catch - auto bman = prepare_batch(batch); + return logits + j*model.vocab.n_tokens(); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} - const auto n_outputs_all = bman->n_outputs_all; +float * llama_context_kv_self::get_embeddings() { + // reorder embeddings for backward compatibility + reorder_outputs(); - // reserve output buffer - // TODO: move to batch manager? - if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; + return embd; +} - int64_t n_outputs_prev = 0; +float * llama_context_kv_self::get_embeddings_ith(int32_t i) { + int32_t j = -1; - while (!bman->is_done()) { - llama_ubatch ubatch = bman->next(); + try { + if (embd == nullptr) { + throw std::runtime_error("no embeddings"); + } - if (!bman->prepare(ubatch)) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - bman->restore(); - return -3; + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; } - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } - ggml_cgraph * gf = build_graph(ubatch, false); + return embd + j*model.hparams.n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); +float * llama_context_kv_self::get_embeddings_seq(llama_seq_id seq_id) { + auto it = embd_seq.find(seq_id); + if (it == embd_seq.end()) { + return nullptr; + } - ggml_backend_sched_alloc_graph(sched.get(), gf); + return it->second.data(); +} - set_inputs(ubatch); +ggml_context_ptr llama_context_kv_self::init() { + inp_tokens = nullptr; + 
inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; - // the output is always the last tensor in the graph - struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); - struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); + return llama_context::init(); +} - if (n_outputs == 0) { - // no output - t_logits = nullptr; - t_embd = nullptr; - } else if (cparams.embeddings) { - t_logits = nullptr; // do not extract logits for embedding case - t_embd = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - t_embd = ggml_graph_node(gf, i); - break; +struct llama_context_kv_self::batch_manager { + batch_manager(llama_context_kv_self & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { + const auto & model = lctx.model; + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + + const auto & kv_self = lctx.kv_self; + + const int64_t n_tokens_all = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int64_t i = 0; i < n_tokens_all; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); } } - GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); - } else { - t_embd = nullptr; // do not extract embeddings when not needed - GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); } - const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - bman->restore(); - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - } + GGML_ASSERT(n_tokens_all <= cparams.n_batch); - bman->update(ubatch); + GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} + if (lctx.t_compute_start_us == 0) { + lctx.t_compute_start_us = ggml_time_us(); + } + lctx.n_queued_tokens += n_tokens_all; - // extract logits - if (t_logits) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(logits != nullptr); + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - float * logits_out = logits + n_outputs_prev*n_vocab; + lctx.embd_seq.clear(); - if (n_outputs) { - GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + // 
count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs_all += batch.logits[i] != 0; } + } else if (lctx.logits_all || embd_pooled) { + n_outputs_all = n_tokens_all; + } else { + // keep last output only + n_outputs_all = 1; } - // extract embeddings - if (t_embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); - GGML_ASSERT(backend_embd != nullptr); + const bool logits_all = n_outputs_all == n_tokens_all; - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(embd != nullptr); - float * embd_out = embd + n_outputs_prev*n_embd; + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ logits_all); + } - if (n_outputs) { - GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings (cleared before processing each batch) - auto & embd_seq_out = embd_seq; + ~batch_manager() { + } - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // extract the rerank score - a single float per sequence - auto & embd_seq_out = embd_seq; + bool is_done() const { + return lctx.sbatch.n_tokens == 0; + } - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(1); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } + llama_ubatch next() { + llama_ubatch ubatch = llama_ubatch(); - n_outputs_prev += n_outputs; - } + const auto & cparams = lctx.cparams; + const auto & kv_self = lctx.kv_self; - // set output mappings - { - bool sorted_output = true; + const auto & n_ubatch = cparams.n_ubatch; - GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { - size_t out_id = sbatch.out_ids[i]; - output_ids[out_id] = i; - if (out_id != i) { - sorted_output = false; + if (kv_self.recurrent) { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = lctx.sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = lctx.sbatch.split_equal(n_ubatch); } + } else { + ubatch = lctx.sbatch.split_simple(n_ubatch); } - if (sorted_output) { - sbatch.out_ids.clear(); - } + return ubatch; } - // set to total number of outputs in the batch, for use in llama_get_logits_ith - n_outputs = n_outputs_all; - - // wait for the computation to finish (automatically done when obtaining the model 
output) - //synchronize(); + bool prepare(const llama_ubatch & ubatch) { + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + const auto & batch = lctx.sbatch.batch; - bman->finalize(); + const auto n_tokens_all = batch->n_tokens; - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + auto & kv_self = lctx.kv_self; - return 0; -} + // count the outputs in this u_batch + { + int32_t n_outputs_new = 0; -int llama_context_kv_self::encode(llama_batch & inp_batch) { - is_encoding = true; + if (n_outputs_all == n_tokens_all) { + n_outputs_new = ubatch.n_tokens; + } else { + GGML_ASSERT(ubatch.output); + for (uint32_t i = 0; i < ubatch.n_tokens; i++) { + n_outputs_new += (int32_t) (ubatch.output[i] != 0); + } + } - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } + // needs to happen before the graph is built + lctx.n_outputs = n_outputs_new; + } - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + lctx.kv_self_update(); - const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens = batch.n_tokens; + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } - const auto & hparams = model.hparams; + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { + return false; + } - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + kv_slot_restorer.save(slot_info); - if (batch.token) { - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; + if (!kv_self.recurrent) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); } } - } - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - if (t_compute_start_us == 0) { - t_compute_start_us = ggml_time_us(); - } + // reserve a worst case graph if needed + if (lctx.need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - n_queued_tokens += n_tokens; + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; - const int64_t n_embd = hparams.n_embd; + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - sbatch.from_batch(batch, n_embd, /* 
simple_split */ true, /* logits_all */ true); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + ggml_cgraph * gf = lctx.build_graph(ubatch, true); - // reserve output buffer - if (reserve_outputs(n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); - return -2; - }; + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } - for (uint32_t i = 0; i < n_tokens; ++i) { - output_ids[i] = i; + lctx.need_reserve = false; + } + + return true; } - inp_embd_enc = NULL; - n_outputs = n_tokens; + void restore() { + kv_slot_restorer.restore(lctx.kv_self); + } - //batch_manager->prepare(ubatch); + void update(const llama_ubatch & ubatch) { + auto & kv_self = lctx.kv_self; - // TODO: do reserve - GGML_ASSERT(need_reserve == false); + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + } - ggml_cgraph * gf = build_graph(ubatch, false); + void finalize() { + const auto & cparams = lctx.cparams; - ggml_backend_sched_alloc_graph(sched.get(), gf); + auto & kv_self = lctx.kv_self; - set_inputs(ubatch); + // decide if we need to defrag the kv cache + if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? 
std::max(0.0f, 1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; - // the output embeddings after the final encoder normalization - struct ggml_tensor * t_embd = nullptr; + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > cparams.defrag_thold) { + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - // there are two cases here - if (llama_model_has_decoder(&model)) { - // first case is an encoder-decoder T5 model where embeddings are passed to decoder - t_embd = ggml_graph_node(gf, -1); - GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); - } else { - // second case is an encoder-only T5 model - if (cparams.embeddings) { - // only output embeddings if required - t_embd = ggml_graph_node(gf, -1); - if (strcmp(t_embd->name, "result_embd_pooled") != 0) { - t_embd = ggml_graph_node(gf, -2); + kv_self.defrag(); } - GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); } } - const auto compute_status = compute_graph(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } + int64_t n_outputs_all = 0; - // extract embeddings - if (t_embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); - GGML_ASSERT(backend_embd != nullptr); + llama_context_kv_self & lctx; - if (llama_model_has_decoder(&model)) { - embd_enc.resize(n_tokens*n_embd); - float * embd_out = embd_enc.data(); + const llama_batch & batch; - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + llama_kv_slot_restorer kv_slot_restorer; +}; - // remember the sequence ids used during the encoding - needed for cross attention later - seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - seq_ids_enc[i].insert(seq_id); - } +std::unique_ptr llama_context_kv_self::prepare_batch(const llama_batch & batch) { + return std::make_unique(*this, batch); +} + +int llama_context_kv_self::decode(llama_batch & inp_batch) { + is_encoding = false; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd; + + // TODO: try catch + auto bman = prepare_batch(batch); + + const auto n_outputs_all = bman->n_outputs_all; + + // reserve output buffer + // TODO: move to batch manager? 
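+    // n_outputs_all was determined by the batch manager above:
+    //   - if batch.logits is provided (and we are not doing pooled embeddings), it is the number of non-zero logits flags
+    //   - if logits_all or pooled embeddings are enabled, it is the full token count
+    //   - otherwise only the last token produces an output (n_outputs_all == 1)
+    // e.g. a prompt batch with only the final logits flag set needs a single output row of n_vocab floats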
+ if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; + + while (!bman->is_done()) { + llama_ubatch ubatch = bman->next(); + + if (!bman->prepare(ubatch)) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + bman->restore(); + return -3; + } + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = build_graph(ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output is always the last tensor in the graph + struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); + struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); + + if (n_outputs == 0) { + // no output + t_logits = nullptr; + t_embd = nullptr; + } else if (cparams.embeddings) { + t_logits = nullptr; // do not extract logits for embedding case + t_embd = nullptr; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + t_embd = ggml_graph_node(gf, i); + break; + } } + GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); } else { - GGML_ASSERT(embd != nullptr); + t_embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); + } + + const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + bman->restore(); + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + bman->update(ubatch); + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + if (t_logits) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits + n_outputs_prev*n_vocab; + + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); switch (cparams.pooling_type) { case LLAMA_POOLING_TYPE_NONE: { // extract token embeddings GGML_ASSERT(embd != nullptr); - float * embd_out = embd; + float * embd_out = embd + n_outputs_prev*n_embd; - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); + } } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: 
case LLAMA_POOLING_TYPE_LAST: { - // extract sequence embeddings + // extract sequence embeddings (cleared before processing each batch) auto & embd_seq_out = embd_seq; - embd_seq_out.clear(); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - for (uint32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; } @@ -1356,93 +1498,306 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { } break; case LLAMA_POOLING_TYPE_RANK: { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { GGML_ABORT("unknown pooling type"); } } } - } - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(sched.get()); - return 0; -} + n_outputs_prev += n_outputs; + } -enum ggml_status llama_context_kv_self::compute_graph( - ggml_cgraph * graph, - bool batched) { - int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; - ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; + // set output mappings + { + bool sorted_output = true; - if (backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); - } + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } + for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { + size_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + if (sorted_output) { + sbatch.out_ids.clear(); + } } - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; - return status; -} + // wait for the computation to finish (automatically done when obtaining the model output) + //synchronize(); -llama_pos llama_context_kv_self::pos_max() const { - return kv_self.pos_max(); -} + bman->finalize(); -uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { - return kv_self.get_padding(cparams); -} + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); -void llama_context_kv_self::prepare_k_shift() { + return 0; } -void llama_context_kv_self::prepare_defrag() { -} +int llama_context_kv_self::encode(llama_batch & inp_batch) { + is_encoding = true; -// llama input + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } -void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); - // - // set input data - // + const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens = batch.n_tokens; - if (inp_K_shift) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + const auto & hparams = model.hparams; - int32_t * data = (int32_t *) inp_K_shift->data; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - for (uint32_t i = 0; i < kv_self.size; ++i) { - data[i] = kv_self.cells[i].delta; + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } } - - // the K-shift graph requires just this input - return; } - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (reserve_outputs(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (uint32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + inp_embd_enc = NULL; + n_outputs = n_tokens; + + //batch_manager->prepare(ubatch); + + // TODO: do reserve + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = build_graph(ubatch, false); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output embeddings after the final encoder normalization + struct ggml_tensor * t_embd = nullptr; + + // there are two cases here + if (llama_model_has_decoder(&model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + t_embd = ggml_graph_node(gf, -1); + GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + t_embd = ggml_graph_node(gf, -1); + if (strcmp(t_embd->name, "result_embd_pooled") != 0) { + t_embd = ggml_graph_node(gf, -2); + } + GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } + + const auto compute_status = compute_graph(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + if (llama_model_has_decoder(&model)) { + embd_enc.resize(n_tokens*n_embd); + float * embd_out = embd_enc.data(); + + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + 
GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + // remember the sequence ids used during the encoding - needed for cross attention later + seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + +enum ggml_status llama_context_kv_self::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + +llama_pos llama_context_kv_self::pos_max() const { + return kv_self.pos_max(); +} + +uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { + return kv_self.get_padding(cparams); +} + +void llama_context_kv_self::prepare_k_shift() { +} + +void llama_context_kv_self::prepare_defrag() { +} + +// llama input + +void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + + // + // set input data + // + + if (inp_K_shift) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; + } + + // the K-shift graph requires just this input + return; + } + + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); } if (ubatch.embd) { @@ -2810,646 +3165,323 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} - cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - - // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); - // split the above in two - // => {d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); - struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - - // conv - { - // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} - struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); - - // copy last (d_conv - 1) columns back into the state cache - struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - - ggml_build_forward_expand(graph, - ggml_cpy(ctx0, last_conv, - ggml_view_1d(ctx0, conv_states_all, - (d_conv - 1)*(d_inner)*(n_seqs), - kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - - // 1D convolution - // The equivalent is to make a self-overlapping view of conv_x - // over d_conv columns at each stride in the 3rd dimension, - // then element-wise multiply that with the conv1d weight, - // then sum the elements of each row, - // (the last two steps are a dot product over rows (also doable with mul_mat)) - // then permute away the ne[0] dimension, - // and then you're 
left with the resulting x tensor. - // For simultaneous sequences, all sequences need to have the same length. - x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); - - // bias - x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); - - x = ggml_silu(ctx0, x); - } - - // ssm - { - // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} - struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); - // split - struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); - struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - - // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_dt_b_c_rms) { - dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); - B = ggml_rms_norm(ctx0, B, norm_rms_eps); - C = ggml_rms_norm(ctx0, C, norm_rms_eps); - } - - // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} - dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); - dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); - - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. - // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} - struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); - - // store last states - ggml_build_forward_expand(graph, - ggml_cpy(ctx0, - ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), - ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); - - struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - - // TODO: skip computing output earlier for unused tokens - - // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} - y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); - - // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); - } - - // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); - //cb(cur, "mamba_out", il); - - return cur; -} - - -ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il, - bool worst_case) { - const auto & hparams = model.hparams; - - const auto token_shift_count = hparams.token_shift_count; - - const auto & n_tokens = ubatch.n_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - struct ggml_tensor * token_shift_all = kv_self.k_l[il]; - - struct ggml_tensor * token_shift = build_copy_mask_state( - ctx0, graph, token_shift_all, state_copy, state_mask, - n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); - - token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); - - return token_shift; -} - - -ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( - ggml_context * ctx0, - ggml_tensor * token_shift, - const llama_ubatch & 
ubatch, - int il, - bool worst_case) { - const auto & hparams = model.hparams; - - const auto token_shift_count = hparams.token_shift_count; - const auto n_embd = hparams.n_embd; - - const auto & n_tokens = ubatch.n_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - - return ggml_cpy( - ctx0, - ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - ); -} - - -ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( - ggml_context * ctx0, - ggml_cgraph * graph, - ggml_tensor * cur, - ggml_tensor * x_prev, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il, - bool worst_case) { - const auto & hparams = model.hparams; - - const auto n_tokens = ubatch.n_tokens; - const auto n_seqs = ubatch.n_seqs; - const auto n_embd = hparams.n_embd; - const auto head_size = hparams.wkv_head_size; - const auto n_head = n_embd / head_size; - const auto n_head_kv = hparams.n_head_kv(il); - - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - - const auto layer = &model.layers[il]; - - bool is_qrwkv = layer->time_mix_first == nullptr; - - struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); - - xxx = ggml_reshape_4d( - ctx0, - ggml_tanh( - ctx0, - ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) - ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - ); - - xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); - - xxx = ggml_mul_mat( - ctx0, - ggml_reshape_4d( - ctx0, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - ), - xxx - ); - - struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { - // fusing these weights makes some performance improvement - sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); - cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); - xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - } else { - // for backward compatibility - xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx0, 
ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); - } - - struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); - struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); - struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx0, r, layer->time_mix_receptance_b); - } - if (layer->time_mix_key_b) { - k = ggml_add(ctx0, k, layer->time_mix_key_b); - } - if (layer->time_mix_value_b) { - v = ggml_add(ctx0, v, layer->time_mix_value_b); - } - - struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); - if (is_qrwkv) { - g = ggml_sigmoid(ctx0, g); - } else { - g = ggml_silu(ctx0, g); - } - - if (n_head_kv != 0 && n_head_kv != n_head) { - GGML_ASSERT(n_head % n_head_kv == 0); - k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens); - v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens); - struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); - k = ggml_repeat(ctx0, k, tmp); - v = ggml_repeat(ctx0, v, tmp); - } - - k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens); - v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens); - r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens); - - struct ggml_tensor * w = ggml_mul_mat( - ctx0, - layer->time_mix_decay_w2, - ggml_tanh( - ctx0, - ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) - ) - ); - - w = ggml_add(ctx0, w, layer->time_mix_decay); - w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); - w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); - - if (is_qrwkv) { - // k = k * (1 - w) - k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); - } - - struct ggml_tensor * wkv_state = build_copy_mask_state( - ctx0, graph, kv_self.v_l[il], state_copy, state_mask, - n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); - - struct ggml_tensor * wkv_output; - if (is_qrwkv) { - wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); - } else { - wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); - } - cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); - wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - ggml_build_forward_expand( - graph, - ggml_cpy( - ctx0, - wkv_state, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); - - if (!is_qrwkv) { - // group norm with head_count groups - cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens); - cur = ggml_norm(ctx0, cur, 64e-5f); - - // Convert back to regular vectors. 
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); - } else { - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - } - - cur = ggml_mul(ctx0, cur, g); - cur = build_lora_mm(ctx0, layer->time_mix_output, cur); - - return cur; -} - -// -// state -// - -// TODO: this needs a big rework - -class llama_io_write_dummy : public llama_io_write_i { -public: - llama_io_write_dummy() = default; - - void write(const void * /* src */, size_t size) override { - size_written += size; - } - - void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override { - size_written += size; - } + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - size_t n_bytes() override { - return size_written; - } + // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); + struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - size_t size_written = 0; -}; + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); -class llama_io_write_buffer : public llama_io_write_i { -public: - llama_io_write_buffer( - uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + // copy last (d_conv - 1) columns back into the state cache + struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - void write(const void * src, size_t size) override { - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - memcpy(ptr, src, size); - ptr += size; - size_written += size; - buf_size -= size; - } + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); - } - ggml_backend_tensor_get(tensor, ptr, offset, size); - ptr += size; - size_written += size; - buf_size -= size; - } + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. 
+ x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); - size_t n_bytes() override { - return size_written; - } + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); - uint8_t * ptr; - size_t buf_size = 0; - size_t size_written = 0; -}; + x = ggml_silu(ctx0, x); + } -class llama_io_read_buffer : public llama_io_read_i { -public: - llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {} + // ssm + { + // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); + struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - const uint8_t * read(size_t size) override { - const uint8_t * base_ptr = ptr; - if (size > buf_size) { - throw std::runtime_error("unexpectedly reached end of buffer"); + // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms) { + dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); + B = ggml_rms_norm(ctx0, B, norm_rms_eps); + C = ggml_rms_norm(ctx0, C, norm_rms_eps); } - ptr += size; - size_read += size; - buf_size -= size; - return base_ptr; - } - void read_to(void * dst, size_t size) override { - memcpy(dst, read(size), size); - } + // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); - size_t n_bytes() override { - return size_read; - } + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. 
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); - const uint8_t * ptr; - size_t buf_size = 0; - size_t size_read = 0; -}; + // store last states + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); -class llama_io_write_file : public llama_io_write_i { -public: - llama_io_write_file(llama_file * f) : file(f) {} + struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - void write(const void * src, size_t size) override { - file->write_raw(src, size); - size_written += size; - } + // TODO: skip computing output earlier for unused tokens - void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override { - temp_buffer.resize(size); - ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size); - write(temp_buffer.data(), temp_buffer.size()); - } + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); - size_t n_bytes() override { - return size_written; + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); } - llama_file * file; - size_t size_written = 0; - std::vector temp_buffer; -}; + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + //cb(cur, "mamba_out", il); -class llama_io_read_file : public llama_io_read_i { -public: - llama_io_read_file(llama_file * f) : file(f) {} + return cur; +} - void read_to(void * dst, size_t size) override { - file->read_raw(dst, size); - size_read += size; - } - const uint8_t * read(size_t size) override { - temp_buffer.resize(size); - read_to(temp_buffer.data(), size); - return temp_buffer.data(); - } +ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; - size_t n_bytes() override { - return size_read; - } + const auto token_shift_count = hparams.token_shift_count; - llama_file * file; - size_t size_read = 0; - std::vector temp_buffer; -}; + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; -size_t llama_context_kv_self::state_get_size() { - llama_io_write_dummy io; - try { - return state_get_data(io); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); - return 0; - } -} + struct ggml_tensor * token_shift_all = kv_self.k_l[il]; -size_t llama_context_kv_self::state_get_data(uint8_t * dst, size_t size) { - llama_io_write_buffer io(dst, size); - try { - return state_get_data(io); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); - return 0; - } -} + struct ggml_tensor * token_shift = build_copy_mask_state( + ctx0, graph, token_shift_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); -size_t llama_context_kv_self::state_set_data(const uint8_t * src, size_t 
size) { - llama_io_read_buffer io(src, size); - try { - return state_set_data(io); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); - return 0; - } -} + token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); -size_t llama_context_kv_self::state_seq_get_size(llama_seq_id seq_id) { - llama_io_write_dummy io; - try { - return state_seq_get_data(io, seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); - return 0; - } + return token_shift; } -size_t llama_context_kv_self::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { - llama_io_write_buffer io(dst, size); - try { - return state_seq_get_data(io, seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); - return 0; - } -} -size_t llama_context_kv_self::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { - llama_io_read_buffer io(src, size); - try { - return state_seq_set_data(io, seq_id); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); - return 0; - } -} +ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; -bool llama_context_kv_self::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); + const auto token_shift_count = hparams.token_shift_count; + const auto n_embd = hparams.n_embd; - // sanity checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; - if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return false; - } - } + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + return ggml_cpy( + ctx0, + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), + ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + ); +} - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); - if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); - return false; - } +ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } + const auto n_tokens = ubatch.n_tokens; + const auto n_seqs = ubatch.n_seqs; + const auto n_embd = hparams.n_embd; + const auto head_size = hparams.wkv_head_size; + const auto n_head = n_embd / head_size; + const auto n_head_kv = hparams.n_head_kv(il); - // restore the context state - { - const size_t n_state_size_cur = file.size() - file.tell(); + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - llama_io_read_file io( &file); - const size_t n_read = state_set_data(io); + const auto layer = &model.layers[il]; - if (n_read != n_state_size_cur) { - LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); - return false; - } - } + bool is_qrwkv = layer->time_mix_first == nullptr; - return true; -} + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); -bool llama_context_kv_self::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); + xxx = ggml_reshape_4d( + ctx0, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + ), + layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + ); - file.write_u32(LLAMA_SESSION_MAGIC); - file.write_u32(LLAMA_SESSION_VERSION); + xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); + xxx = ggml_mul_mat( + ctx0, + ggml_reshape_4d( + ctx0, + layer->time_mix_w2, + layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + ), + xxx + ); - // save the context state using stream saving - llama_io_write_file io(&file); - state_get_data(io); + struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + if (layer->time_mix_lerp_fused) { + // fusing these weights makes some performance improvement + sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + } else { + // for backward compatibility + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = 
ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - return true; -} + xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + } -size_t llama_context_kv_self::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(filepath, "rb"); + struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); + struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); + struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); + if (layer->time_mix_receptance_b) { + r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + } + if (layer->time_mix_key_b) { + k = ggml_add(ctx0, k, layer->time_mix_key_b); + } + if (layer->time_mix_value_b) { + v = ggml_add(ctx0, v, layer->time_mix_value_b); + } - // version checks - { - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); + struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); + if (is_qrwkv) { + g = ggml_sigmoid(ctx0, g); + } else { + g = ggml_silu(ctx0, g); + } - if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { - LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); - return 0; - } + if (n_head_kv != 0 && n_head_kv != n_head) { + GGML_ASSERT(n_head % n_head_kv == 0); + k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens); + v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens); + struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); + k = ggml_repeat(ctx0, k, tmp); + v = ggml_repeat(ctx0, v, tmp); } - // load the prompt - { - const uint32_t n_token_count = file.read_u32(); + k = ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens); + v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens); + r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens); - if (n_token_count > n_token_capacity) { - LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! 
%u > %zu\n", __func__, n_token_count, n_token_capacity); - return 0; - } + struct ggml_tensor * w = ggml_mul_mat( + ctx0, + layer->time_mix_decay_w2, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + ) + ); - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - } + w = ggml_add(ctx0, w, layer->time_mix_decay); + w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); - // restore the context state - { - const size_t state_size = file.size() - file.tell(); - llama_io_read_file io(&file); - const size_t nread = state_seq_set_data(io, seq_id); - if (!nread) { - LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); - return 0; - } - GGML_ASSERT(nread <= state_size); - GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); + if (is_qrwkv) { + // k = k * (1 - w) + k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); } - return file.tell(); -} + struct ggml_tensor * wkv_state = build_copy_mask_state( + ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); -size_t llama_context_kv_self::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { - llama_file file(filepath, "wb"); + struct ggml_tensor * wkv_output; + if (is_qrwkv) { + wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); + } else { + wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); + } + cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - file.write_u32(LLAMA_STATE_SEQ_MAGIC); - file.write_u32(LLAMA_STATE_SEQ_VERSION); + ggml_build_forward_expand( + graph, + ggml_cpy( + ctx0, + wkv_state, + ggml_view_1d( + ctx0, + kv_self.v_l[il], + hparams.n_embd_v_s() * n_seqs, + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + ) + ) + ); - // save the prompt - file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); + if (!is_qrwkv) { + // group norm with head_count groups + cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens); + cur = ggml_norm(ctx0, cur, 64e-5f); - // save the context state using stream saving - llama_io_write_file io(&file); - state_seq_get_data(io, seq_id); + // Convert back to regular vectors. 
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + } else { + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + } - const size_t res = file.tell(); - GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes()); + cur = ggml_mul(ctx0, cur, g); + cur = build_lora_mm(ctx0, layer->time_mix_output, cur); - return res; + return cur; } -size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { - synchronize(); +// state save/load - // write model info - { - const std::string arch_str = llm_arch_name(model.arch); - io.write_string(arch_str); - // TODO: add more model-specific info which should prevent loading the session file if not identical - } +size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { + llama_context::state_get_data(io); // write output ids { @@ -3492,7 +3524,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { } } - // write mbeddings + // write embeddings { const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); @@ -3509,19 +3541,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { } size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { - synchronize(); - - // read model info - { - const std::string cur_arch_str = llm_arch_name(model.arch); - - std::string arch_str; - io.read_string(arch_str); - if (cur_arch_str != arch_str) { - throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); - } - // TODO: add more info which needs to be identical but which is not verified otherwise - } + llama_context::state_set_data(io); // read output ids { @@ -3584,7 +3604,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { } size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - synchronize(); + llama_context::state_seq_get_data(io, seq_id); kv_self.state_write(io, model.hparams, seq_id); @@ -3592,7 +3612,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se } size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - synchronize(); + llama_context::state_seq_set_data(io, seq_id); kv_self.state_read(io, model.hparams, seq_id); @@ -3937,15 +3957,21 @@ size_t llama_state_get_size(struct llama_context * ctx) { } size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { + ctx->synchronize(); + return ctx->state_get_data(dst, size); } // Sets the state reading from the specified source address size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { + ctx->synchronize(); + return ctx->state_set_data(src, size); } bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + ctx->synchronize(); + try { return ctx->state_load_file(path_session, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { @@ -3955,6 +3981,8 @@ bool llama_state_load_file(struct llama_context * ctx, const char * path_session } bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + ctx->synchronize(); + try { return ctx->state_save_file(path_session, tokens, n_token_count); } catch (const 
std::exception & err) { @@ -3968,14 +3996,20 @@ size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) } size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { + ctx->synchronize(); + return ctx->state_seq_get_data(seq_id, dst, size); } size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) { + ctx->synchronize(); + return ctx->state_seq_set_data(seq_id, src, size); } size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { + ctx->synchronize(); + try { return ctx->state_seq_save_file(seq_id, filepath, tokens, n_token_count); } catch (const std::exception & err) { @@ -3985,6 +4019,8 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa } size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + ctx->synchronize(); + try { return ctx->state_seq_load_file(dest_seq_id, filepath, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { diff --git a/src/llama-context.h b/src/llama-context.h index 204793d75a5b1..235fcfee4fb91 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -144,37 +144,37 @@ struct llama_context : public llama_graph_i { // state save/load - virtual size_t state_get_size() = 0; - virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; - virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; + virtual size_t state_get_size(); + virtual size_t state_get_data( uint8_t * dst, size_t size); + virtual size_t state_set_data(const uint8_t * src, size_t size); - virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; - virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; - virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; + virtual size_t state_seq_get_size(llama_seq_id seq_id); + virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size); + virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size); virtual bool state_load_file( const char * filepath, llama_token * tokens_out, size_t n_token_capacity, - size_t * n_token_count_out) = 0; + size_t * n_token_count_out); virtual bool state_save_file( const char * filepath, const llama_token * tokens, - size_t n_token_count) = 0; + size_t n_token_count); virtual size_t state_seq_load_file( llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, - size_t * n_token_count_out) = 0; + size_t * n_token_count_out); virtual size_t state_seq_save_file( llama_seq_id seq_id, const char * filepath, const llama_token * tokens, - size_t n_token_count) = 0; + size_t n_token_count); // perf @@ -183,6 +183,14 @@ struct llama_context : public llama_graph_i { protected: + // state save/load + + virtual size_t state_get_data(llama_io_write_i & io); + virtual size_t state_set_data(llama_io_read_i & io); + + virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); + virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); + // members const llama_model & model; @@ -471,46 +479,12 @@ class llama_context_kv_self : public llama_context { int il, bool worst_case) override; - // state 
save/load - - virtual size_t state_get_size() override; - virtual size_t state_get_data( uint8_t * dst, size_t size) override; - virtual size_t state_set_data(const uint8_t * src, size_t size) override; - - virtual size_t state_seq_get_size(llama_seq_id seq_id) override; - virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; - virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; - - virtual bool state_load_file( - const char * filepath, - llama_token * tokens_out, - size_t n_token_capacity, - size_t * n_token_count_out) override; - - virtual bool state_save_file( - const char * filepath, - const llama_token * tokens, - size_t n_token_count) override; - - virtual size_t state_seq_load_file( - llama_seq_id seq_id, - const char * filepath, - llama_token * tokens_out, - size_t n_token_capacity, - size_t * n_token_count_out) override; - - virtual size_t state_seq_save_file( - llama_seq_id seq_id, - const char * filepath, - const llama_token * tokens, - size_t n_token_count) override; - -private: - size_t state_get_data(llama_io_write_i & io); - size_t state_set_data(llama_io_read_i & io); +protected: + virtual size_t state_get_data(llama_io_write_i & io) override; + virtual size_t state_set_data(llama_io_read_i & io) override; - size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); - size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); + virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; }; // For internal test use From e08f38df69b0cf47b461c16d2541e78ddd3b9cb7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 12:50:53 +0200 Subject: [PATCH 41/84] context : minor cleanup ggml-ci --- src/llama-context.cpp | 57 +++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bde6659531024..e234e3683bc39 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -10,30 +10,6 @@ #include #include -static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { - // TODO move to hparams if a T5 variant appears that uses a different value - const int64_t max_distance = 128; - - if (bidirectional) { - n_buckets >>= 1; - } - - const int64_t max_exact = n_buckets >> 1; - - int32_t relative_position = x - y; - int32_t relative_bucket = 0; - if (bidirectional) { - relative_bucket += (relative_position > 0) * n_buckets; - relative_position = abs(relative_position); - } else { - relative_position = -std::min(relative_position, 0); - } - int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); - relative_position_if_large = std::min(relative_position_if_large, n_buckets - 1); - relative_bucket += (relative_position < max_exact ? 
relative_position : relative_position_if_large); - return relative_bucket; -} - // // llama_context // @@ -346,6 +322,7 @@ class llama_io_write_dummy : public llama_io_write_i { return size_written; } +private: size_t size_written = 0; }; @@ -378,6 +355,7 @@ class llama_io_write_buffer : public llama_io_write_i { return size_written; } +private: uint8_t * ptr; size_t buf_size = 0; size_t size_written = 0; @@ -406,6 +384,7 @@ class llama_io_read_buffer : public llama_io_read_i { return size_read; } +private: const uint8_t * ptr; size_t buf_size = 0; size_t size_read = 0; @@ -430,6 +409,7 @@ class llama_io_write_file : public llama_io_write_i { return size_written; } +private: llama_file * file; size_t size_written = 0; std::vector temp_buffer; @@ -454,6 +434,7 @@ class llama_io_read_file : public llama_io_read_i { return size_read; } +private: llama_file * file; size_t size_read = 0; std::vector temp_buffer; @@ -2132,6 +2113,30 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing + static const auto relative_position_bucket = [](llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { + // TODO move to hparams if a T5 variant appears that uses a different value + const int64_t max_distance = 128; + + if (bidirectional) { + n_buckets >>= 1; + } + + const int64_t max_exact = n_buckets >> 1; + + int32_t relative_position = x - y; + int32_t relative_bucket = 0; + if (bidirectional) { + relative_bucket += (relative_position > 0) * n_buckets; + relative_position = abs(relative_position); + } else { + relative_position = -std::min(relative_position, 0); + } + int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); + relative_position_if_large = std::min(relative_position_if_large, n_buckets - 1); + relative_bucket += (relative_position < max_exact ? 
relative_position : relative_position_if_large); + return relative_bucket; + }; + int32_t * data = (int32_t *) inp_pos_bucket->data; if (!is_encoding) { @@ -2139,7 +2144,7 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); + data[h*(n_kv*n_tokens) + j*n_kv + i] = relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } @@ -2147,7 +2152,7 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } From 107d1e2c32612552676db06c028a2cf4d7f2aa03 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 15:42:14 +0200 Subject: [PATCH 42/84] context : move output functionality to base class ggml-ci --- src/llama-context.cpp | 760 +++++++++++++++++++++--------------------- src/llama-context.h | 97 +++--- 2 files changed, 419 insertions(+), 438 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e234e3683bc39..33c256feddc8a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -58,6 +58,105 @@ enum llama_pooling_type llama_context::pooling_type() const { return cparams.pooling_type; } +float * llama_context::get_logits() { + // reorder logits for backward compatibility + output_reorder(); + + return logits; +} + +float * llama_context::get_logits_ith(int32_t i) { + int32_t j = -1; + + try { + if (logits == nullptr) { + throw std::runtime_error("no logits"); + } + + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } + + return logits + j*model.vocab.n_tokens(); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + +float * llama_context::get_embeddings() { + // reorder embeddings for backward compatibility + output_reorder(); + + return embd; +} + +float * llama_context::get_embeddings_ith(int32_t i) { + int32_t j = -1; + + try { + if (embd == nullptr) { + throw std::runtime_error("no embeddings"); + } + + if (i < 0) { + j = n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); + } + } else if ((size_t) i >= output_ids.size()) { + throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); + } else { + j = output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } 
+ if (j >= n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + } + + return embd + j*model.hparams.n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + +float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { + auto it = embd_seq.find(seq_id); + if (it == embd_seq.end()) { + return nullptr; + } + + return it->second.data(); +} + int64_t llama_context::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; } @@ -631,6 +730,58 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // TODO: add more model-specific info which should prevent loading the session file if not identical } + // write output ids + { + output_reorder(); + + const uint32_t n_outputs = this->n_outputs; + const auto & output_ids = this->output_ids; + + std::vector w_output_pos; + + GGML_ASSERT(n_outputs <= output_size); + + w_output_pos.resize(n_outputs); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch(); ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + GGML_ASSERT((uint32_t) pos < n_outputs); + w_output_pos[pos] = i; + } + } + + io.write(&n_outputs, sizeof(n_outputs)); + + if (n_outputs) { + io.write(w_output_pos.data(), n_outputs * sizeof(int32_t)); + } + } + + // write logits + { + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); + + io.write(&logits_size, sizeof(logits_size)); + + if (logits_size) { + io.write(logits, logits_size * sizeof(float)); + } + } + + // write embeddings + { + const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); + + io.write(&embd_size, sizeof(embd_size)); + + if (embd_size) { + io.write(embd, embd_size * sizeof(float)); + } + } + return io.n_bytes(); } @@ -647,6 +798,61 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { // TODO: add more info which needs to be identical but which is not verified otherwise } + // read output ids + { + std::vector output_pos; + + uint32_t n_outputs; + io.read_to(&n_outputs, sizeof(n_outputs)); + + if (n_outputs > output_reserve(n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } + + if (n_outputs) { + output_pos.resize(n_outputs); + io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); + } + this->output_ids[id] = i; + } + + this->n_outputs = n_outputs; + } + } + + // read logits + { + uint64_t logits_size; + io.read_to(&logits_size, sizeof(logits_size)); + + if (this->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } + + if (logits_size) { + io.read_to(this->logits, logits_size * sizeof(float)); + } + } + + // read embeddings + { + uint64_t embd_size; + io.read_to(&embd_size, sizeof(embd_size)); + + if (this->embd_size < embd_size) { + throw std::runtime_error("embeddings buffer too small"); + } + + if (embd_size) { + io.read_to(this->embd, embd_size * sizeof(float)); + } + } + return io.n_bytes(); } @@ -852,7 +1058,7 @@ 
llama_context_kv_self::llama_context_kv_self( // graph outputs buffer { // resized during inference when a batch uses more outputs - if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { + if (output_reserve(params.n_seq_max) < params.n_seq_max) { LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); throw std::runtime_error("failed to reserve initial output buffer"); } @@ -988,105 +1194,6 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -float * llama_context_kv_self::get_logits() { - // reorder logits for backward compatibility - reorder_outputs(); - - return logits; -} - -float * llama_context_kv_self::get_logits_ith(int32_t i) { - int32_t j = -1; - - try { - if (logits == nullptr) { - throw std::runtime_error("no logits"); - } - - if (i < 0) { - j = n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); - } - } else if ((size_t) i >= output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); - } else { - j = output_ids[i]; - } - - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } - - return logits + j*model.vocab.n_tokens(); - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif - } -} - -float * llama_context_kv_self::get_embeddings() { - // reorder embeddings for backward compatibility - reorder_outputs(); - - return embd; -} - -float * llama_context_kv_self::get_embeddings_ith(int32_t i) { - int32_t j = -1; - - try { - if (embd == nullptr) { - throw std::runtime_error("no embeddings"); - } - - if (i < 0) { - j = n_outputs + i; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs)); - } - } else if ((size_t) i >= output_ids.size()) { - throw std::runtime_error(format("out of range [0, %zu)", output_ids.size())); - } else { - j = output_ids[i]; - } - - if (j < 0) { - throw std::runtime_error(format("batch.logits[%d] != true", i)); - } - if (j >= n_outputs) { - // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); - } - - return embd + j*model.hparams.n_embd; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); -#ifndef NDEBUG - GGML_ABORT("fatal error"); -#else - return nullptr; -#endif - } -} - -float * llama_context_kv_self::get_embeddings_seq(llama_seq_id seq_id) { - auto it = embd_seq.find(seq_id); - if (it == embd_seq.end()) { - return nullptr; - } - - return it->second.data(); -} - ggml_context_ptr llama_context_kv_self::init() { inp_tokens = nullptr; inp_embd = nullptr; @@ -1357,7 +1464,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // reserve output buffer // TODO: move to batch manager? 
- if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { + if (output_reserve(bman->n_outputs_all) < (size_t) n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; }; @@ -1579,7 +1686,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { const llama_ubatch ubatch = sbatch.split_simple(n_tokens); // reserve output buffer - if (reserve_outputs(n_tokens) < n_tokens) { + if (output_reserve(n_tokens) < n_tokens) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); return -2; }; @@ -1712,33 +1819,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { return 0; } -enum ggml_status llama_context_kv_self::compute_graph( - ggml_cgraph * graph, - bool batched) { - int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; - ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; - - if (backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); - } - - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } - - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); - - return status; -} - llama_pos llama_context_kv_self::pos_max() const { return kv_self.pos_max(); } @@ -1747,12 +1827,6 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c return kv_self.get_padding(cparams); } -void llama_context_kv_self::prepare_k_shift() { -} - -void llama_context_kv_self::prepare_defrag() { -} - // llama input void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { @@ -2192,117 +2266,10 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { for (int j = 0; j < n_output_enc; ++j) { data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY; - } - } - } - } -} - -void llama_context_kv_self::reorder_outputs() { - std::vector & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = model.vocab.n_tokens(); - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? 
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); - } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); - } - } - } - std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - -size_t llama_context_kv_self::reserve_outputs(size_t n_outputs) { - const auto & hparams = model.hparams; - const auto & vocab = model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - logits_size = has_logits ? n_vocab*n_outputs_max : 0; - embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (output_ids.empty()) { - // init, never resized afterwards - output_ids.resize(n_batch); - } - - const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!buf_output || prev_size < new_size) { - if (buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - buf_output = nullptr; - logits = nullptr; - embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; + } + } } } - - float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); - - logits = has_logits ? output_base : nullptr; - embd = has_embd ? 
output_base + logits_size : nullptr; - - output_size = n_outputs_max; - - // set all ids as invalid (negative) - std::fill(output_ids.begin(), output_ids.end(), -1); - - ggml_backend_buffer_clear(buf_output.get(), 0); - - n_outputs = 0; - - return n_outputs_max; } void llama_context_kv_self::kv_self_update() { @@ -2315,8 +2282,6 @@ void llama_context_kv_self::kv_self_update() { // apply K-shift if needed if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - prepare_k_shift(); - ggml_backend_sched_reset(sched.get()); auto ctx = init(); @@ -2346,8 +2311,6 @@ void llama_context_kv_self::kv_self_update() { // defragment the KV cache if needed if (kv.do_defrag) { - prepare_defrag(); - ggml_backend_sched_reset(sched.get()); auto ctx = init(); @@ -3333,20 +3296,20 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - const auto layer = &model.layers[il]; + const auto & layer = model.layers[il]; - bool is_qrwkv = layer->time_mix_first == nullptr; + bool is_qrwkv = layer.time_mix_first == nullptr; struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur); xxx = ggml_reshape_4d( ctx0, ggml_tanh( ctx0, - ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + ggml_mul_mat(ctx0, layer.time_mix_w1, xxx) ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens ); xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); @@ -3355,18 +3318,18 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( ctx0, ggml_reshape_4d( ctx0, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + layer.time_mix_w2, + layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5 ), xxx ); struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { + if (layer.time_mix_lerp_fused) { // fusing these weights makes some performance improvement sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer.time_mix_lerp_fused), sx), cur); xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); @@ -3380,27 +3343,27 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer.time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx0, ggml_mul(ctx0, 
ggml_add(ctx0, xk, layer.time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer.time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer.time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer.time_mix_lerp_g), sx), cur); } - struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); - struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); - struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + struct ggml_tensor * r = build_lora_mm(ctx0, layer.time_mix_receptance, xr); + struct ggml_tensor * k = build_lora_mm(ctx0, layer.time_mix_key, xk); + struct ggml_tensor * v = build_lora_mm(ctx0, layer.time_mix_value, xv); + if (layer.time_mix_receptance_b) { + r = ggml_add(ctx0, r, layer.time_mix_receptance_b); } - if (layer->time_mix_key_b) { - k = ggml_add(ctx0, k, layer->time_mix_key_b); + if (layer.time_mix_key_b) { + k = ggml_add(ctx0, k, layer.time_mix_key_b); } - if (layer->time_mix_value_b) { - v = ggml_add(ctx0, v, layer->time_mix_value_b); + if (layer.time_mix_value_b) { + v = ggml_add(ctx0, v, layer.time_mix_value_b); } - struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); + struct ggml_tensor * g = build_lora_mm(ctx0, layer.time_mix_gate, xg); if (is_qrwkv) { g = ggml_sigmoid(ctx0, g); } else { @@ -3422,14 +3385,14 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( struct ggml_tensor * w = ggml_mul_mat( ctx0, - layer->time_mix_decay_w2, + layer.time_mix_decay_w2, ggml_tanh( ctx0, - ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + ggml_mul_mat(ctx0, layer.time_mix_decay_w1, xw) ) ); - w = ggml_add(ctx0, w, layer->time_mix_decay); + w = ggml_add(ctx0, w, layer.time_mix_decay); w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); @@ -3446,7 +3409,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( if (is_qrwkv) { wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); } else { - wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); + wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer.time_mix_first, w, wkv_state); } cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); @@ -3472,13 +3435,13 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( // Convert back to regular vectors. 
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b); } else { cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); } cur = ggml_mul(ctx0, cur, g); - cur = build_lora_mm(ctx0, layer->time_mix_output, cur); + cur = build_lora_mm(ctx0, layer.time_mix_output, cur); return cur; } @@ -3488,58 +3451,6 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); - // write output ids - { - reorder_outputs(); - - const uint32_t n_outputs = this->n_outputs; - const auto & output_ids = this->output_ids; - - std::vector w_output_pos; - - GGML_ASSERT(n_outputs <= output_size); - - w_output_pos.resize(n_outputs); - - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch(); ++i) { - // map an output id to a position in the batch - int32_t pos = output_ids[i]; - if (pos >= 0) { - GGML_ASSERT((uint32_t) pos < n_outputs); - w_output_pos[pos] = i; - } - } - - io.write(&n_outputs, sizeof(n_outputs)); - - if (n_outputs) { - io.write(w_output_pos.data(), n_outputs * sizeof(int32_t)); - } - } - - // write logits - { - const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); - - io.write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - io.write(logits, logits_size * sizeof(float)); - } - } - - // write embeddings - { - const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); - - io.write(&embd_size, sizeof(embd_size)); - - if (embd_size) { - io.write(embd, embd_size * sizeof(float)); - } - } - kv_self.state_write(io, model.hparams); return io.n_bytes(); @@ -3548,61 +3459,6 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); - // read output ids - { - std::vector output_pos; - - uint32_t n_outputs; - io.read_to(&n_outputs, sizeof(n_outputs)); - - if (n_outputs > reserve_outputs(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } - - if (n_outputs) { - output_pos.resize(n_outputs); - io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= n_batch()) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); - } - this->output_ids[id] = i; - } - - this->n_outputs = n_outputs; - } - } - - // read logits - { - uint64_t logits_size; - io.read_to(&logits_size, sizeof(logits_size)); - - if (this->logits_size < logits_size) { - throw std::runtime_error("logits buffer too small"); - } - - if (logits_size) { - io.read_to(this->logits, logits_size * sizeof(float)); - } - } - - // read embeddings - { - uint64_t embd_size; - io.read_to(&embd_size, sizeof(embd_size)); - - if (this->embd_size < embd_size) { - throw std::runtime_error("embeddings buffer too small"); - } - - if (embd_size) { - io.read_to(this->embd, embd_size * sizeof(float)); - } - } - kv_self.state_read(io, model.hparams); return io.n_bytes(); @@ -3768,6 +3624,140 @@ int32_t llama_apply_adapter_cvec( return res ? 
0 : -1; } +enum ggml_status llama_context::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + +size_t llama_context::output_reserve(size_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + + logits = has_logits ? output_base : nullptr; + embd = has_embd ? 
output_base + logits_size : nullptr; + + output_size = n_outputs_max; + + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); + + ggml_backend_buffer_clear(buf_output.get(), 0); + + n_outputs = 0; + + return n_outputs_max; +} + +void llama_context::output_reorder() { + std::vector & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} + // // kv cache view // diff --git a/src/llama-context.h b/src/llama-context.h index 235fcfee4fb91..16d138b4cbd35 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -43,12 +43,12 @@ struct llama_context : public llama_graph_i { virtual enum llama_pooling_type pooling_type() const; - virtual float * get_logits() = 0; - virtual float * get_logits_ith(int32_t i) = 0; + virtual float * get_logits(); + virtual float * get_logits_ith(int32_t i); - virtual float * get_embeddings() = 0; - virtual float * get_embeddings_ith(int32_t i) = 0; - virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; + virtual float * get_embeddings(); + virtual float * get_embeddings_ith(int32_t i); + virtual float * get_embeddings_seq(llama_seq_id seq_id); virtual int64_t n_pos_per_token() const; // vision @@ -85,6 +85,19 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); + // returns the result of ggml_backend_sched_graph_compute_async execution + virtual enum ggml_status compute_graph( + ggml_cgraph * graph, + bool batched); + + // Make sure enough space is available for outputs. + // Returns max number of outputs for which space was reserved. 
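+    // Example usage (sketch; n_outputs_all is a hypothetical per-batch output count):
+    //
+    //     if (output_reserve(n_outputs_all) < n_outputs_all) {
+    //         LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, (int) n_outputs_all);
+    //         return -2;
+    //     }
+    //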
+ virtual size_t output_reserve(size_t n_outputs); + + // make the outputs have the same order they had in the user-provided batch + // TODO: maybe remove this + virtual void output_reorder(); + // graph build API (generic) virtual void build_cb( @@ -198,6 +211,7 @@ struct llama_context : public llama_graph_i { llama_cparams cparams; llama_adapter_cvec cvec; llama_loras loras; + llama_sbatch sbatch; ggml_threadpool_t threadpool = nullptr; ggml_threadpool_t threadpool_batch = nullptr; @@ -215,6 +229,31 @@ struct llama_context : public llama_graph_i { // memory buffers used to evaluate the model std::vector buf_compute_meta; + // host buffer for the model output (logits and embeddings) + ggml_backend_buffer_ptr buf_output; + + // TODO: remove + bool logits_all = false; + + // decode output (2-dimensional array: [n_outputs][n_vocab]) + size_t logits_size = 0; // capacity (of floats) for logits + float * logits = nullptr; + + // embeddings output (2-dimensional array: [n_outputs][n_embd]) + // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE + size_t embd_size = 0; // capacity (of floats) for embeddings + float * embd = nullptr; + + // sequence embeddings output (map of [n_embd] vectors) + // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE + std::map> embd_seq; + + size_t output_size = 0; // capacity (of tokens positions) for the output buffers + int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch + + std::vector output_ids; // map batch token positions to ids of the logits and embd buffers + + bool need_reserve = false; bool has_evaluated_once = false; mutable int64_t t_start_us = 0; @@ -247,69 +286,21 @@ class llama_context_kv_self : public llama_context { virtual void kv_self_update() override; - virtual float * get_logits() override; - virtual float * get_logits_ith(int32_t i) override; - - virtual float * get_embeddings() override; - virtual float * get_embeddings_ith(int32_t i) override; - virtual float * get_embeddings_seq(llama_seq_id seq_id) override; - virtual ggml_context_ptr init() override; virtual int decode(llama_batch & inp_batch) override; virtual int encode(llama_batch & inp_batch) override; - llama_sbatch sbatch; - - // host buffer for the model output (logits and embeddings) - ggml_backend_buffer_ptr buf_output; - - // decode output (2-dimensional array: [n_outputs][n_vocab]) - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; - - std::vector output_ids; // map batch token positions to ids of the logits and embd buffers - size_t output_size = 0; // capacity (of tokens positions) for the output buffers - int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch - - bool logits_all = false; - bool need_reserve = false; - - // embeddings output (2-dimensional array: [n_outputs][n_embd]) - // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE - size_t embd_size = 0; // capacity (of floats) for embeddings - float * embd = nullptr; - - // sequence embeddings output (map of [n_embd] vectors) - // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE - std::map> embd_seq; - virtual std::unique_ptr prepare_batch(const llama_batch & batch); - // returns the result of ggml_backend_sched_graph_compute_async execution - enum ggml_status compute_graph( - ggml_cgraph * graph, - bool batched); - // max token position across all sequences in the current context llama_pos pos_max() const; // certain implementations could 
require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - void prepare_k_shift(); - void prepare_defrag(); - void set_inputs(const llama_ubatch & ubatch); - // make the outputs have the same order they had in the user-provided batch - // TODO: maybe remove this - void reorder_outputs(); - - // Make sure enough space is available for outputs. - // Returns max number of outputs for which space was reserved. - size_t reserve_outputs(size_t n_outputs); - // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] From ed3cb55abefed68e4123b269da7d840fc9531010 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 15:53:15 +0200 Subject: [PATCH 43/84] context : abstract input ggml-ci --- src/llama-context.cpp | 638 +++++++++++++++++++++--------------------- src/llama-context.h | 23 +- 2 files changed, 334 insertions(+), 327 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 33c256feddc8a..485430095f2f9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -269,6 +269,309 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } +enum ggml_status llama_context::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + +void llama_context::input_set(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + } + + if (ubatch.embd) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); + } + + if (ubatch.pos && inp_pos) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); + } + + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); + + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch.output) { + int32_t n_outputs = 0; + 
for (int i = 0; i < n_tokens; ++i) { + if (ubatch.output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); + + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + + sum[seq_id] += ubatch.n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } + } + + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } + + GGML_ASSERT( + // (!a || b) is a logical implication (a -> b) + // 
!hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); +} + +size_t llama_context::output_reserve(size_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + + logits = has_logits ? output_base : nullptr; + embd = has_embd ? output_base + logits_size : nullptr; + + output_size = n_outputs_max; + + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); + + ggml_backend_buffer_clear(buf_output.get(), 0); + + n_outputs = 0; + + return n_outputs_max; +} + +void llama_context::output_reorder() { + std::vector & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? 
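+        // illustrative example (hypothetical values): out_ids = { 7, 2, 5 } is sorted to { 2, 5, 7 }
+        // while the matching logits/embd rows are swapped along with it, and afterwards
+        // output_ids[2] = 0, output_ids[5] = 1, output_ids[7] = 2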
+ // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} + + void llama_context::build_cb( ggml_tensor * cur, const char * name, @@ -1489,7 +1792,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - set_inputs(ubatch); + input_set(ubatch); // the output is always the last tensor in the graph struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); @@ -1710,7 +2013,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - set_inputs(ubatch); + input_set(ubatch); // the output embeddings after the final encoder normalization struct ggml_tensor * t_embd = nullptr; @@ -1829,84 +2132,24 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c // llama input -void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { +void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - - // - // set input data - // - - if (inp_K_shift) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); - - int32_t * data = (int32_t *) inp_K_shift->data; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - data[i] = kv_self.cells[i].delta; - } - - // the K-shift graph requires just this input - return; - } - - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); - } - - if (ubatch.embd) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); - } - - if (ubatch.pos && inp_pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - - if (!inp_out_ids) { - LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); - int32_t * data = (int32_t *) inp_out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } + + if (inp_K_shift) { + 
assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; } + + // the K-shift graph requires just this input + return; } - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + // call base functionality + llama_context::input_set(ubatch); if (inp_KQ_mask || inp_KQ_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. @@ -2029,111 +2272,6 @@ void llama_context_kv_self::set_inputs(const llama_ubatch & ubatch) { } } - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - - float * data = (float *) inp_mean->data; - memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than 
n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } - if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; @@ -2293,7 +2431,7 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_alloc_graph(sched.get(), gf); - set_inputs({}); + input_set({}); compute_graph(gf, false); @@ -2323,7 +2461,7 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_alloc_graph(sched.get(), gf); // no input - //set_inputs({}); + //input_set({}); compute_graph(gf, false); @@ -3624,140 +3762,6 @@ int32_t llama_apply_adapter_cvec( return res ? 0 : -1; } -enum ggml_status llama_context::compute_graph( - ggml_cgraph * graph, - bool batched) { - int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; - ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; - - if (backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); - } - - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } - - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); - - return status; -} - -size_t llama_context::output_reserve(size_t n_outputs) { - const auto & hparams = model.hparams; - const auto & vocab = model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - logits_size = has_logits ? n_vocab*n_outputs_max : 0; - embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (output_ids.empty()) { - // init, never resized afterwards - output_ids.resize(n_batch); - } - - const size_t prev_size = buf_output ? 
ggml_backend_buffer_get_size(buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!buf_output || prev_size < new_size) { - if (buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - buf_output = nullptr; - logits = nullptr; - embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); - - logits = has_logits ? output_base : nullptr; - embd = has_embd ? output_base + logits_size : nullptr; - - output_size = n_outputs_max; - - // set all ids as invalid (negative) - std::fill(output_ids.begin(), output_ids.end(), -1); - - ggml_backend_buffer_clear(buf_output.get(), 0); - - n_outputs = 0; - - return n_outputs_max; -} - -void llama_context::output_reorder() { - std::vector & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = model.vocab.n_tokens(); - const uint32_t n_embd = model.hparams.n_embd; - - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? - // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); - } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); - } - } - } - std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - // // kv cache view // diff --git a/src/llama-context.h b/src/llama-context.h index 16d138b4cbd35..f8040138222c2 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -90,6 +90,8 @@ struct llama_context : public llama_graph_i { ggml_cgraph * graph, bool batched); + virtual void input_set(const llama_ubatch & ubatch); + // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. 
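     // Returns 0 if the output buffer could not be (re)allocated.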
virtual size_t output_reserve(size_t n_outputs); @@ -204,6 +206,15 @@ struct llama_context : public llama_graph_i { virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); + // input tensors + + struct ggml_tensor * inp_tokens; // I32 [n_batch] + struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + struct ggml_tensor * inp_pos; // I32 [n_batch] + struct ggml_tensor * inp_out_ids; // I32 [n_outputs] + struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + struct ggml_tensor * inp_cls; // I32 [n_batch] + // members const llama_model & model; @@ -288,6 +299,8 @@ class llama_context_kv_self : public llama_context { virtual ggml_context_ptr init() override; + virtual void input_set(const llama_ubatch & ubatch) override; + virtual int decode(llama_batch & inp_batch) override; virtual int encode(llama_batch & inp_batch) override; @@ -299,16 +312,6 @@ class llama_context_kv_self : public llama_context { // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - void set_inputs(const llama_ubatch & ubatch); - - // input tensors - struct ggml_tensor * inp_tokens; // I32 [n_batch] - struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - struct ggml_tensor * inp_pos; // I32 [n_batch] - struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - struct ggml_tensor * inp_cls; // I32 [n_batch] - // === unified KV cache === llama_kv_cache kv_self; From 131743ff4f17bfe65c5bf6b79187ad9fd7fcdb55 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Feb 2025 17:13:42 +0200 Subject: [PATCH 44/84] context : abstract constructor and init ggml-ci --- src/llama-context.cpp | 657 ++++++++++++++++++++++-------------------- src/llama-context.h | 25 +- src/llama.cpp | 2 +- 3 files changed, 359 insertions(+), 325 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 485430095f2f9..31085f644ba0f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -14,14 +14,290 @@ // llama_context // -llama_context::llama_context(const llama_model & model) : +llama_context::llama_context( + const llama_model & model, + const llama_context_params & params) : model (model), t_start_us(model.t_start_us), t_load_us (model.t_load_us) { + const auto & hparams = model.hparams; + + cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; + cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : + hparams.n_ctx_train; + + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + } + + cparams.yarn_attn_factor *= hparams.rope_attn_factor; + + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + } else { + cparams.pooling_type = hparams.pooling_type; + } + } + + if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { + cparams.causal_attn = hparams.causal_attn; + } else { + cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + } + + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + logits_all = params.logits_all; + + if (!hparams.vocab_only) { + // GPU backends + for (auto * dev : model.devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + + // add ACCEL backends (such as BLAS) + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == 
GGML_BACKEND_DEVICE_TYPE_ACCEL) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + } + + // add CPU backend + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + throw std::runtime_error("failed to initialize CPU backend"); + } + backends.emplace_back(backend_cpu); + + // create a list of the set_n_threads functions in the backends + for (auto & backend : backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); + } + } + } + + llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); + + // graph outputs buffer + { + // resized during inference when a batch uses more outputs + if (output_reserve(params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); + throw std::runtime_error("failed to reserve initial output buffer"); + } + + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name (buf_output.get()), + ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + } + } } llama_context::~llama_context() = default; +void llama_context::init() { + const auto & hparams = model.hparams; + + if (hparams.vocab_only) { + LLAMA_LOG_WARN("%s: model is vocab-only -- no computation will be performed\n", __func__); + return; + } + + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : backends) { + auto * buft = ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); + } + + const size_t max_nodes = model.max_nodes(); + + // buffer used to store the computation graph and the tensor meta data + // TODO: move to base class + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + cparams.offload_kqv; + + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = 
ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } + + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + } + + // initialize scheduler with the worst-case graph + { + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + + // reserve pp graph first so that buffers are only allocated once + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_pp = ggml_graph_n_nodes(gf_pp); + + // reserve with tg graph to get the number of splits and nodes + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); + if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_tg = ggml_graph_n_nodes(gf_tg); + + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + gf_pp = build_graph(ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + } + } +} + const llama_model & llama_context::get_model() const { return model; } @@ -161,46 +437,6 @@ int64_t llama_context::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 
4 : 1; } -ggml_context_ptr llama_context::init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - return ggml_context_ptr { ggml_init(params) }; -} - -void llama_context::synchronize() { - ggml_backend_sched_synchronize(sched.get()); - - // FIXME: if multiple single tokens are evaluated without a synchronization, - // the stats will be added to the prompt evaluation stats - // this should only happen when using batch size 1 to evaluate a batch - - // add the evaluation to the stats - if (n_queued_tokens == 1) { - if (!cparams.no_perf) { - t_eval_us += ggml_time_us() - t_compute_start_us; - } - n_eval++; - } else if (n_queued_tokens > 1) { - if (!cparams.no_perf) { - t_p_eval_us += ggml_time_us() - t_compute_start_us; - } - n_p_eval += n_queued_tokens; - } - - // get a more accurate load time, upon first eval - if (n_queued_tokens > 0 && !has_evaluated_once) { - t_load_us = ggml_time_us() - t_start_us; - has_evaluated_once = true; - } - - n_queued_tokens = 0; - t_compute_start_us = 0; -} - void llama_context::attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) { @@ -269,7 +505,54 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -enum ggml_status llama_context::compute_graph( +void llama_context::synchronize() { + ggml_backend_sched_synchronize(sched.get()); + + // FIXME: if multiple single tokens are evaluated without a synchronization, + // the stats will be added to the prompt evaluation stats + // this should only happen when using batch size 1 to evaluate a batch + + // add the evaluation to the stats + if (n_queued_tokens == 1) { + if (!cparams.no_perf) { + t_eval_us += ggml_time_us() - t_compute_start_us; + } + n_eval++; + } else if (n_queued_tokens > 1) { + if (!cparams.no_perf) { + t_p_eval_us += ggml_time_us() - t_compute_start_us; + } + n_p_eval += n_queued_tokens; + } + + // get a more accurate load time, upon first eval + if (n_queued_tokens > 0 && !has_evaluated_once) { + t_load_us = ggml_time_us() - t_start_us; + has_evaluated_once = true; + } + + n_queued_tokens = 0; + t_compute_start_us = 0; +} + +ggml_context_ptr llama_context::graph_init() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + return ggml_context_ptr { ggml_init(params) }; +} + +enum ggml_status llama_context::graph_compute( ggml_cgraph * graph, bool batched) { int n_threads = batched ? 
cparams.n_threads_batch : cparams.n_threads; @@ -608,7 +891,7 @@ void llama_context::build_cb( } ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(*this, cparams, ubatch, init(), worst_case); + return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case); } llama_perf_context_data llama_context::perf_get_data() const { @@ -1183,100 +1466,15 @@ void llama_context::perf_reset() { llama_context_kv_self::llama_context_kv_self( const llama_model & model, - const llama_context_params & params) : llama_context(model) { + const llama_context_params & params) : + llama_context(model, params) { const auto & hparams = model.hparams; - cparams.n_seq_max = std::max(1u, params.n_seq_max); - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; - cparams.embeddings = params.embeddings; - cparams.offload_kqv = params.offload_kqv; - cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); - - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; - } - - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : - hparams.n_ctx_train; - - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; - - auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { - rope_scaling_type = hparams.rope_scaling_type_train; - } - - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { - cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none - } - - if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 
1.0f : 0.0f; - } - - cparams.yarn_attn_factor *= hparams.rope_attn_factor; - - if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; - } else { - cparams.pooling_type = hparams.pooling_type; - } - } + LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { - cparams.causal_attn = hparams.causal_attn; - } else { - cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; - } + cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - - if (n_ctx_per_seq < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - logits_all = params.logits_all; + LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); // build worst-case graph for encoder if a model contains encoder is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() @@ -1298,51 +1496,6 @@ llama_context_kv_self::llama_context_kv_self( GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); if (!hparams.vocab_only) { - // GPU backends - for (auto * dev : model.devices) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - throw std::runtime_error("failed to initialize backend"); - } - backends.emplace_back(backend); - } - - // add ACCEL backends (such as BLAS) - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - throw std::runtime_error("failed to initialize backend"); - } - backends.emplace_back(backend); - } - } - - // add CPU backend - backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - throw std::runtime_error("failed to initialize CPU backend"); - } - backends.emplace_back(backend_cpu); - - // create a list of the set_n_threads functions in the backends - for (auto & backend : backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); - ggml_backend_reg_t reg = dev ? 
ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); - } - } - } - - llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); - if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); throw std::runtime_error("failed to initialize self-attention cache"); @@ -1357,128 +1510,6 @@ llama_context_kv_self::llama_context_kv_self( ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - - // graph outputs buffer - { - // resized during inference when a batch uses more outputs - if (output_reserve(params.n_seq_max) < params.n_seq_max) { - LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); - throw std::runtime_error("failed to reserve initial output buffer"); - } - - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name (buf_output.get()), - ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); - } - - // scheduler and compute buffers - { - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model.devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; - } - } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } - - const size_t max_nodes = model.max_nodes(); - - // buffer used to store the computation graph and the tensor meta data - // TODO: move to base class - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model.n_devices() > 1 && - model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; - - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { - // ignore CPU backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; - } - } - } - - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); - - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled 
(n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); - } - - // initialize scheduler with the worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); - - // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(sched.get(), gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); - - // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); - ggml_backend_sched_reserve(sched.get(), gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); - - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = build_graph(ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } - - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } - - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); - } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); - } - } } } @@ -1497,15 +1528,7 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -ggml_context_ptr llama_context_kv_self::init() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; +ggml_context_ptr llama_context_kv_self::graph_init() { inp_KQ_mask = nullptr; inp_KQ_mask_cnv = nullptr; inp_KQ_mask_swa = nullptr; @@ -1514,8 +1537,10 @@ ggml_context_ptr llama_context_kv_self::init() { inp_K_shift = nullptr; inp_s_copy = nullptr; inp_s_mask = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; - return llama_context::init(); + return llama_context::graph_init(); } struct llama_context_kv_self::batch_manager { @@ -1817,7 +1842,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); } - const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); + const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != 
GGML_STATUS_SUCCESS) { bman->restore(); switch (compute_status) { @@ -2035,7 +2060,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { } } - const auto compute_status = compute_graph(gf, n_tokens > 1); + const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: break; @@ -2422,7 +2447,7 @@ void llama_context_kv_self::kv_self_update() { if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched.get()); - auto ctx = init(); + auto ctx = graph_init(); auto ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -2433,7 +2458,7 @@ void llama_context_kv_self::kv_self_update() { input_set({}); - compute_graph(gf, false); + graph_compute(gf, false); need_reserve = true; } @@ -2451,7 +2476,7 @@ void llama_context_kv_self::kv_self_update() { if (kv.do_defrag) { ggml_backend_sched_reset(sched.get()); - auto ctx = init(); + auto ctx = graph_init(); auto ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -2463,7 +2488,7 @@ void llama_context_kv_self::kv_self_update() { // no input //input_set({}); - compute_graph(gf, false); + graph_compute(gf, false); kv.do_defrag = false; diff --git a/src/llama-context.h b/src/llama-context.h index f8040138222c2..e70c99f331cd3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -21,9 +21,16 @@ class llama_io_write_i; using llama_loras = std::unordered_map; struct llama_context : public llama_graph_i { - llama_context(const llama_model & model); + llama_context( + const llama_model & model, + const llama_context_params & params); + virtual ~llama_context(); + // init scheduler and compute buffers + // call once after the context is constructed + virtual void init(); + const llama_model & get_model() const; const llama_cparams & get_cparams() const; @@ -52,10 +59,6 @@ struct llama_context : public llama_graph_i { virtual int64_t n_pos_per_token() const; // vision - virtual ggml_context_ptr init(); - - virtual void synchronize(); - virtual void attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch); @@ -85,8 +88,14 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); + //// + + virtual void synchronize(); + + virtual ggml_context_ptr graph_init(); + // returns the result of ggml_backend_sched_graph_compute_async execution - virtual enum ggml_status compute_graph( + virtual enum ggml_status graph_compute( ggml_cgraph * graph, bool batched); @@ -297,7 +306,7 @@ class llama_context_kv_self : public llama_context { virtual void kv_self_update() override; - virtual ggml_context_ptr init() override; + virtual ggml_context_ptr graph_init() override; virtual void input_set(const llama_ubatch & ubatch) override; @@ -312,7 +321,7 @@ class llama_context_kv_self : public llama_context { // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - // === unified KV cache === + // === KV cache === llama_kv_cache kv_self; diff --git a/src/llama.cpp b/src/llama.cpp index d20a2a6d50f60..a677902f0ba7c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -328,6 +328,7 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct ctx = new llama_context_kv_self(*model, params); + ctx->init(); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); return 
nullptr;
@@ -410,7 +411,6 @@ const char * llama_print_system_info(void) {
     static std::string s;
     s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
-
     for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
         auto * reg = ggml_backend_reg_get(i);
         auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");

From d5e8e1a2ba315599d09e6d5fbb37a2b98f841c07 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 14 Feb 2025 16:10:55 +0200
Subject: [PATCH 45/84] context : remove batch_manager

ggml-ci
---
 src/llama-batch.h     |   4 +-
 src/llama-context.cpp | 334 ++++++++++++++++++------------------
 src/llama-context.h   |  61 ++++----
 src/llama-kv-cache.h  |   6 +-
 4 files changed, 178 insertions(+), 227 deletions(-)

diff --git a/src/llama-batch.h b/src/llama-batch.h
index 773c3808b770f..f1df40d27086e 100644
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -42,9 +42,9 @@ struct llama_sbatch {
     bool logits_all; // TODO: remove once lctx.logits_all is removed too

     // sorted indices into the batch
-    std::vector ids;
+    std::vector ids;
     // batch indices of the output
-    std::vector out_ids;
+    std::vector out_ids;
     std::vector seq;

     const llama_batch * batch = nullptr;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 31085f644ba0f..f3fa4c592c86b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -161,7 +161,7 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-        if (output_reserve(params.n_seq_max) < params.n_seq_max) {
+        if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
             LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
             throw std::runtime_error("failed to reserve initial output buffer");
         }
@@ -747,11 +747,11 @@ void llama_context::input_set(const llama_ubatch & ubatch) {
     );
 }

-size_t llama_context::output_reserve(size_t n_outputs) {
+int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto & hparams = model.hparams;
     const auto & vocab   = model.vocab;

-    const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
+    const int64_t n_outputs_max = std::max(n_outputs, cparams.n_seq_max);

     const auto n_batch = cparams.n_batch;
     const auto n_vocab = vocab.n_tokens();
@@ -817,7 +817,7 @@ size_t llama_context::output_reserve(size_t n_outputs) {
 }

 void llama_context::output_reorder() {
-    std::vector & out_ids = sbatch.out_ids;
+    auto & out_ids = sbatch.out_ids;
     if (!out_ids.empty()) {
         const uint32_t n_vocab = model.vocab.n_tokens();
         const uint32_t n_embd  = model.hparams.n_embd;
@@ -1320,8 +1320,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
     {
         output_reorder();

-        const uint32_t n_outputs = this->n_outputs;
-        const auto & output_ids  = this->output_ids;
+        const auto n_outputs    = this->n_outputs;
+        const auto & output_ids = this->output_ids;

         std::vector w_output_pos;

@@ -1334,7 +1334,7 @@ size_t llama_context::state_get_data(llama_io_write_i & io) {
             // map an output id to a position in the batch
             int32_t pos = output_ids[i];
             if (pos >= 0) {
-                GGML_ASSERT((uint32_t) pos < n_outputs);
+                GGML_ASSERT(pos < n_outputs);
                 w_output_pos[pos] = i;
             }
         }
@@ -1386,15 +1386,15 @@ size_t llama_context::state_set_data(llama_io_read_i & io) {
     // read output ids
     {
-        std::vector output_pos;
-
-        uint32_t n_outputs;
+        auto n_outputs = this->n_outputs;

         io.read_to(&n_outputs, sizeof(n_outputs));

         if (n_outputs > output_reserve(n_outputs)) {
             throw
std::runtime_error("could not reserve outputs"); } + std::vector output_pos; + if (n_outputs) { output_pos.resize(n_outputs); io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); @@ -1543,73 +1543,112 @@ ggml_context_ptr llama_context_kv_self::graph_init() { return llama_context::graph_init(); } -struct llama_context_kv_self::batch_manager { - batch_manager(llama_context_kv_self & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { - const auto & model = lctx.model; - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; +int llama_context_kv_self::decode(llama_batch & inp_batch) { + is_encoding = false; - const auto & kv_self = lctx.kv_self; + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + const llama_batch & batch = batch_allocr.batch; - if (batch.token) { - for (int64_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - throw std::runtime_error("invalid token"); - } - } + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + + const int64_t n_tokens_all = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + // TODO: remove this stuff + class batch_guard { + public: + batch_guard(llama_kv_cache & kv_self) : kv_slot_restorer(kv_self) { } - GGML_ASSERT(n_tokens_all <= cparams.n_batch); + ~batch_guard() { + if (!is_done) { + kv_slot_restorer.restore(); + } + } - GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); + void done() { + is_done = true; + } - if (lctx.t_compute_start_us == 0) { - lctx.t_compute_start_us = ggml_time_us(); + void save(const llama_kv_cache_slot_info & slot_info) { + kv_slot_restorer.save(slot_info); } - lctx.n_queued_tokens += n_tokens_all; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + private: + bool is_done = false; - lctx.embd_seq.clear(); + llama_kv_slot_restorer kv_slot_restorer; + }; + + batch_guard bg(kv_self); - // count outputs - if (batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens_all; ++i) { - n_outputs_all += batch.logits[i] != 0; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int64_t i = 0; i < n_tokens_all; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); } - } else if (lctx.logits_all || embd_pooled) { - n_outputs_all = n_tokens_all; - } else { - // keep last output only - n_outputs_all = 1; } + } - const bool logits_all = n_outputs_all == n_tokens_all; + GGML_ASSERT(n_tokens_all <= 
cparams.n_batch); - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ logits_all); - } + GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); - ~batch_manager() { + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); } + n_queued_tokens += n_tokens_all; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - bool is_done() const { - return lctx.sbatch.n_tokens == 0; + embd_seq.clear(); + + int64_t n_outputs_all = 0; + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + } else if (logits_all || embd_pooled) { + n_outputs_all = n_tokens_all; + } else { + // keep last output only + n_outputs_all = 1; } - llama_ubatch next() { - llama_ubatch ubatch = llama_ubatch(); + const bool logits_all = n_outputs_all == n_tokens_all; + + sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ logits_all); + + // reserve output buffer + // TODO: move to batch manager? + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; + while (sbatch.n_tokens > 0) { + llama_ubatch ubatch = llama_ubatch(); const auto & n_ubatch = cparams.n_ubatch; @@ -1618,28 +1657,16 @@ struct llama_context_kv_self::batch_manager { if (kv_self.recurrent) { if (embd_pooled) { // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); + ubatch = sbatch.split_seq(n_ubatch); } else { // recurrent model architectures are easier to implement // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); + ubatch = sbatch.split_equal(n_ubatch); } } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); + ubatch = sbatch.split_simple(n_ubatch); } - return ubatch; - } - - bool prepare(const llama_ubatch & ubatch) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & batch = lctx.sbatch.batch; - - const auto n_tokens_all = batch->n_tokens; - - auto & kv_self = lctx.kv_self; - // count the outputs in this u_batch { int32_t n_outputs_new = 0; @@ -1654,12 +1681,12 @@ struct llama_context_kv_self::batch_manager { } // needs to happen before the graph is built - lctx.n_outputs = n_outputs_new; + n_outputs = n_outputs_new; } // non-causal masks do not use the KV cache if (hparams.causal_attn) { - lctx.kv_self_update(); + kv_self_update(); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -1669,10 +1696,11 @@ struct llama_context_kv_self::batch_manager { const auto slot_info = kv_self.find_slot(ubatch); if (!slot_info) { - return false; + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + return -3; } - kv_slot_restorer.save(slot_info); + bg.save(slot_info); if (!kv_self.recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized @@ -1687,12 +1715,9 @@ struct llama_context_kv_self::batch_manager { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", 
kv_self.n, kv_self.used, kv_self.head); // reserve a worst case graph if needed - if (lctx.need_reserve) { + if (need_reserve) { LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - // build worst-case graph uint32_t n_seqs = 1; // TODO: worst-case number of sequences uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -1700,112 +1725,15 @@ struct llama_context_kv_self::batch_manager { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = lctx.build_graph(ubatch, true); + ggml_cgraph * gf = build_graph(ubatch, true); // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + ggml_backend_sched_reset(sched.get()); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } - lctx.need_reserve = false; - } - - return true; - } - - void restore() { - kv_slot_restorer.restore(lctx.kv_self); - } - - void update(const llama_ubatch & ubatch) { - auto & kv_self = lctx.kv_self; - - // update the kv ring buffer - { - kv_self.head += ubatch.n_tokens; - - // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; - } - } - } - - void finalize() { - const auto & cparams = lctx.cparams; - - auto & kv_self = lctx.kv_self; - - // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { - // - do not defrag small contexts (i.e. < 2048 tokens) - // - count the padding towards the number of used tokens - const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - - kv_self.defrag(); - } - } - } - - int64_t n_outputs_all = 0; - - llama_context_kv_self & lctx; - - const llama_batch & batch; - - llama_kv_slot_restorer kv_slot_restorer; -}; - -std::unique_ptr llama_context_kv_self::prepare_batch(const llama_batch & batch) { - return std::make_unique(*this, batch); -} - -int llama_context_kv_self::decode(llama_batch & inp_batch) { - is_encoding = false; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - - const auto & vocab = model.vocab; - const auto & hparams = model.hparams; - - const int32_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; - - // TODO: try catch - auto bman = prepare_batch(batch); - - const auto n_outputs_all = bman->n_outputs_all; - - // reserve output buffer - // TODO: move to batch manager? 
- if (output_reserve(bman->n_outputs_all) < (size_t) n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; - - int64_t n_outputs_prev = 0; - - while (!bman->is_done()) { - llama_ubatch ubatch = bman->next(); - - if (!bman->prepare(ubatch)) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - bman->restore(); - return -3; + need_reserve = false; } ggml_backend_sched_reset(sched.get()); @@ -1844,7 +1772,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { - bman->restore(); switch (compute_status) { case GGML_STATUS_ABORTED: return 2; @@ -1856,7 +1783,15 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { } } - bman->update(ubatch); + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { @@ -1936,14 +1871,17 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { n_outputs_prev += n_outputs; } + // finalize the batch processing + bg.done(); + // set output mappings { bool sorted_output = true; GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); - for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { - size_t out_id = sbatch.out_ids[i]; + for (int64_t i = 0; i < n_outputs_all; ++i) { + int64_t out_id = sbatch.out_ids[i]; output_ids[out_id] = i; if (out_id != i) { sorted_output = false; @@ -1961,7 +1899,19 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // wait for the computation to finish (automatically done when obtaining the model output) //synchronize(); - bman->finalize(); + // decide if we need to defrag the kv cache + if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > cparams.defrag_thold) { + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); + + kv_self.defrag(); + } + } // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. @@ -1983,14 +1933,14 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens = batch.n_tokens; + const int32_t n_tokens = batch.n_tokens; const auto & hparams = model.hparams; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT if (batch.token) { - for (uint32_t i = 0; i < n_tokens; ++i) { + for (int32_t i = 0; i < n_tokens; ++i) { if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; @@ -1999,7 +1949,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { } // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); if (t_compute_start_us == 0) { t_compute_start_us = ggml_time_us(); @@ -2019,7 +1969,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { return -2; }; - for (uint32_t i = 0; i < n_tokens; ++i) { + for (int32_t i = 0; i < n_tokens; ++i) { output_ids[i] = i; } @@ -2087,7 +2037,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { // remember the sequence ids used during the encoding - needed for cross attention later seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { + for (int32_t i = 0; i < n_tokens; i++) { for (int s = 0; s < ubatch.n_seq_id[i]; s++) { llama_seq_id seq_id = ubatch.seq_id[i][s]; seq_ids_enc[i].insert(seq_id); @@ -2116,7 +2066,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - for (uint32_t i = 0; i < n_tokens; i++) { + for (int32_t i = 0; i < n_tokens; i++) { const llama_seq_id seq_id = ubatch.seq_id[i][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; @@ -2448,7 +2398,7 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_reset(sched.get()); auto ctx = graph_init(); - auto ctx0 = ctx.get(); + auto * ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -2477,7 +2427,7 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_reset(sched.get()); auto ctx = graph_init(); - auto ctx0 = ctx.get(); + auto * ctx0 = ctx.get(); ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); diff --git a/src/llama-context.h b/src/llama-context.h index e70c99f331cd3..f2ebf4f13321f 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -92,6 +92,7 @@ struct llama_context : public llama_graph_i { virtual void synchronize(); + // zero-out inputs and create ggml_context virtual ggml_context_ptr graph_init(); // returns the result of ggml_backend_sched_graph_compute_async execution @@ -103,13 +104,40 @@ struct llama_context : public llama_graph_i { // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. 
-    virtual size_t output_reserve(size_t n_outputs);
+    virtual int32_t output_reserve(int32_t n_outputs);

     // make the outputs have the same order they had in the user-provided batch
     // TODO: maybe remove this
     virtual void output_reorder();

+    // decode a batch of tokens by evaluating the transformer
+    // in case of unsuccessful decoding (error or warning),
+    // the kv_cache state will be returned to its original state
+    // (for non-recurrent models) or cleaned (for recurrent models)
+    //
+    //   - lctx:      llama context
+    //   - inp_batch: batch to evaluate
+    //
+    // return 0 on success
+    // return positive int on warning
+    // return negative int on error
+    //
+    virtual int decode(llama_batch & inp_batch) = 0;
+
+    // encode a batch of tokens by evaluating the encoder part of the transformer
+    //
+    //   - lctx:  llama context
+    //   - batch: batch to evaluate
+    //
+    // return 0 on success
+    // return positive int on warning
+    // return negative int on error
+    //
+    virtual int encode(llama_batch & inp_batch) = 0;
+
+    //
     // graph build API (generic)
+    //

     virtual void build_cb(
              ggml_tensor * cur,
@@ -141,31 +169,6 @@ struct llama_context : public llama_graph_i {

     virtual ggml_tensor * build_rope_factors(int il);

-    // decode a batch of tokens by evaluating the transformer
-    // in case of unsuccessful decoding (error or warning),
-    // the kv_cache state will be returned to its original state
-    // (for non-recurrent models) or cleaned (for recurrent models)
-    //
-    //   - lctx:      llama context
-    //   - inp_batch: batch to evaluate
-    //
-    // return 0 on success
-    // return positive int on warning
-    // return negative int on error
-    //
-    virtual int decode(llama_batch & inp_batch) = 0;
-
-    // encode a batch of tokens by evaluating the encoder part of the transformer
-    //
-    //   - lctx:  llama context
-    //   - batch: batch to evaluate
-    //
-    // return 0 on success
-    // return positive int on warning
-    // return negative int on error
-    //
-    virtual int encode(llama_batch & inp_batch) = 0;
-
     // state save/load

     virtual size_t state_get_size();
@@ -268,7 +271,7 @@ struct llama_context : public llama_graph_i {
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
     std::map> embd_seq;

-    size_t  output_size = 0; // capacity (of tokens positions) for the output buffers
+    int32_t output_size = 0; // capacity (of tokens positions) for the output buffers
     int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch

     std::vector output_ids; // map batch token positions to ids of the logits and embd buffers
@@ -291,8 +294,6 @@ struct llama_context : public llama_graph_i {
 // transformer with a self-attention KV cache
 class llama_context_kv_self : public llama_context {
 public:
-    struct batch_manager;
-
     llama_context_kv_self(
             const llama_model & model,
             const llama_context_params & params);
@@ -313,8 +314,6 @@ class llama_context_kv_self : public llama_context {
     virtual int decode(llama_batch & inp_batch) override;
     virtual int encode(llama_batch & inp_batch) override;

-    virtual std::unique_ptr prepare_batch(const llama_batch & batch);
-
     // max token position across all sequences in the current context
     llama_pos pos_max() const;

diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 6ea4972979661..3bb07ca9da431 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -150,7 +150,9 @@ struct llama_kv_slot_restorer {

     bool do_restore = false;

-    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+    llama_kv_cache & cache;
+
+    explicit
llama_kv_slot_restorer(llama_kv_cache & cache) : cache(cache) { old_state.head = cache.head; old_state.n = cache.n; } @@ -167,7 +169,7 @@ struct llama_kv_slot_restorer { // must be explicitly called to restore the kv_cache state // and rollback changes from all llama_kv_cache_find_slot calls - void restore(struct llama_kv_cache & cache) { + void restore() { if (do_restore) { cache.head = old_state.head; cache.n = old_state.n; From 828064564cb661c763d7fb8ac9f0095666b143c3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 14 Feb 2025 16:48:21 +0200 Subject: [PATCH 46/84] context : move common inputs to base class ggml-ci --- src/llama-context.cpp | 178 +++++++++++++++++++++--------------------- src/llama-context.h | 44 +++++------ 2 files changed, 111 insertions(+), 111 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f3fa4c592c86b..01dd19e559481 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -987,6 +987,95 @@ ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } +ggml_tensor * llama_context::build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) { + const auto & hparams = model.hparams; + + const int64_t n_embd = hparams.n_embd; + + struct ggml_tensor * inpL; + + if (ubatch.token) { + inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp_tokens, "inp_tokens", -1); + ggml_set_input(inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = inp_embd; + ggml_set_input(inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + //cb(inpL, "inp_embd", -1); + + return inpL; +} + +ggml_tensor * llama_context::build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens) { + inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp_pos); + + return inp_pos; +} + +ggml_tensor * llama_context::build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + const int32_t n_out_ids = worst_case ? 
n_tokens : n_outputs; + + inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); + ggml_set_input(inp_out_ids); + + return inp_out_ids; +} + +ggml_tensor * llama_context::build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens) { + inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp_mean); + + return inp_mean; +} + +ggml_tensor * llama_context::build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) { + inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp_cls); + + return inp_cls; +} + // // state // @@ -2682,95 +2771,6 @@ ggml_tensor * llama_context_kv_self::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -ggml_tensor * llama_context_kv_self::build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) { - const auto & hparams = model.hparams; - - const int64_t n_embd = hparams.n_embd; - - struct ggml_tensor * inpL; - - if (ubatch.token) { - inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp_tokens, "inp_tokens", -1); - ggml_set_input(inp_tokens); - - inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens); - - // apply lora for embedding tokens if needed - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( - ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, inp_tokens) - ), scale); - - inpL = ggml_add(ctx0, inpL, inpL_delta); - } - } else { - inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp_embd; - ggml_set_input(inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); - } - - //cb(inpL, "inp_embd", -1); - - return inpL; -} - -ggml_tensor * llama_context_kv_self::build_inp_pos( - ggml_context * ctx0, - int32_t n_tokens) { - inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp_pos); - - return inp_pos; -} - -ggml_tensor * llama_context_kv_self::build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { - const int32_t n_out_ids = worst_case ? 
n_tokens : n_outputs; - - inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); - ggml_set_input(inp_out_ids); - - return inp_out_ids; -} - -ggml_tensor * llama_context_kv_self::build_inp_mean( - ggml_context * ctx0, - int32_t n_tokens) { - inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp_mean); - - return inp_mean; -} - -ggml_tensor * llama_context_kv_self::build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) { - inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp_cls); - - return inp_cls; -} - void llama_context_kv_self::build_k_shift( ggml_context * ctx0, ggml_cgraph * graph) { diff --git a/src/llama-context.h b/src/llama-context.h index f2ebf4f13321f..e3483228d3d1a 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -169,6 +169,28 @@ struct llama_context : public llama_graph_i { virtual ggml_tensor * build_rope_factors(int il); + virtual ggml_tensor * build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch); + + virtual ggml_tensor * build_inp_pos( + ggml_context * ctx0, + int32_t n_tokens); + + virtual ggml_tensor * build_inp_out_ids( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case); + + virtual ggml_tensor * build_inp_mean( + ggml_context * ctx0, + int32_t n_tokens); + + virtual ggml_tensor * build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens); + // state save/load virtual size_t state_get_size(); @@ -330,28 +352,6 @@ class llama_context_kv_self : public llama_context { struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] struct ggml_tensor * inp_K_shift; // I32 [kv_size] - virtual ggml_tensor * build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) override; - - virtual ggml_tensor * build_inp_pos( - ggml_context * ctx0, - int32_t n_tokens) override; - - virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) override; - - virtual ggml_tensor * build_inp_mean( - ggml_context * ctx0, - int32_t n_tokens) override; - - virtual ggml_tensor * build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) override; - virtual void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, From 1d801d27b9b9a79bc06255548792df9ae4f6c7fe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 14 Feb 2025 17:22:55 +0200 Subject: [PATCH 47/84] graph : update attn/kv_self names --- src/llama-context.cpp | 12 ++++++------ src/llama-context.h | 6 +++--- src/llama-graph.h | 6 +++--- src/llama-model.cpp | 10 +++++----- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 01dd19e559481..94d6d4f907d08 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2491,7 +2491,7 @@ void llama_context_kv_self::kv_self_update() { ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - build_k_shift(ctx0, gf); + build_kv_self_shift(ctx0, gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2520,7 +2520,7 @@ void llama_context_kv_self::kv_self_update() { ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - build_defrag(ctx0, gf); + build_kv_self_defrag(ctx0, gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2762,7 +2762,7 @@ ggml_tensor * llama_context_kv_self::build_attn_qkv( return cur; } -ggml_tensor * llama_context_kv_self::build_soft_max_ext( +ggml_tensor * llama_context_kv_self::build_attn_soft_max( ggml_context * ctx0, ggml_tensor * 
kq, float kq_scale) { @@ -2771,7 +2771,7 @@ ggml_tensor * llama_context_kv_self::build_soft_max_ext( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -void llama_context_kv_self::build_k_shift( +void llama_context_kv_self::build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * graph) { const auto & n_ctx = cparams.n_ctx; @@ -2843,7 +2843,7 @@ void llama_context_kv_self::build_k_shift( } } -void llama_context_kv_self::build_defrag( +void llama_context_kv_self::build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * graph) { const auto & hparams = model.hparams; @@ -2860,7 +2860,7 @@ void llama_context_kv_self::build_defrag( // number of cells moved uint32_t n_moves = 0; - // each move requires 6*n_layer tensors (see build_defrag) + // each move requires 6*n_layer tensors (see build_kv_self_defrag) // - source view, destination view, copy operation // - x2 for keys and values //const uint32_t max_moves = model.max_nodes()/(6*n_layer); diff --git a/src/llama-context.h b/src/llama-context.h index e3483228d3d1a..7a10f84bd86bb 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -379,17 +379,17 @@ class llama_context_kv_self : public llama_context { int il, bool worst_case) override; - virtual ggml_tensor * build_soft_max_ext( + virtual ggml_tensor * build_attn_soft_max( ggml_context * ctx0, ggml_tensor * kq, float kq_scale) override; - virtual void build_k_shift( + virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * graph) override; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_defrag( + virtual void build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * graph) override; diff --git a/src/llama-graph.h b/src/llama-graph.h index 5267d53da4c06..d60b57491f2cb 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -92,17 +92,17 @@ class llama_graph_i { int il, bool worst_case) = 0; - virtual ggml_tensor * build_soft_max_ext( + virtual ggml_tensor * build_attn_soft_max( ggml_context * ctx0, ggml_tensor * kq, float kq_scale) = 0; - virtual void build_k_shift( + virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * graph) = 0; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_defrag( + virtual void build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * graph) = 0; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ba11f1e1514cc..543e78d2b9c41 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4251,18 +4251,18 @@ struct llm_build_context { return cur; } - struct ggml_cgraph * build_k_shift() { + struct ggml_cgraph * build_kv_self_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - lgf.build_k_shift(ctx0, gf); + lgf.build_kv_self_shift(ctx0, gf); return gf; } - struct ggml_cgraph * build_defrag() { + struct ggml_cgraph * build_kv_self_defrag() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - lgf.build_defrag(ctx0, gf); + lgf.build_kv_self_defrag(ctx0, gf); return gf; } @@ -5638,7 +5638,7 @@ struct llm_build_context { cb(kq, "kq", il); //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); + kq = lgf.build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, 
ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); From c23590319a54f1bb0c92033fec750e029cdab956 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 11:16:53 +0200 Subject: [PATCH 48/84] graph : add llama_graph_result ggml-ci --- src/llama-context.cpp | 67 ++++--- src/llama-context.h | 6 +- src/llama-graph.h | 7 + src/llama-model.cpp | 433 +++++++++++------------------------------- src/llama-model.h | 4 +- 5 files changed, 167 insertions(+), 350 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 94d6d4f907d08..55f1c03826468 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -246,31 +246,48 @@ void llama_context::init() { uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + int n_splits_pp = -1; + int n_nodes_pp = -1; + + int n_splits_tg = -1; + int n_nodes_tg = -1; + // reserve pp graph first so that buffers are only allocated once - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = build_graph(ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); + { + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto res_pp = graph_build(ubatch_pp, true); + auto & gf_pp = res_pp.gf; + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + n_nodes_pp = ggml_graph_n_nodes(gf_pp); } - int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = build_graph(ubatch_tg, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); + { + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto res_tg = graph_build(ubatch_tg, true); + auto & gf_tg = res_tg.gf; + if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + n_nodes_tg = ggml_graph_n_nodes(gf_tg); } - int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = build_graph(ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); + { + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, 
nullptr, nullptr}; + auto res_pp = graph_build(ubatch_pp, true); + auto & gf_pp = res_pp.gf; + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } } for (size_t i = 0; i < backend_ptrs.size(); ++i) { @@ -890,7 +907,7 @@ void llama_context::build_cb( } } -ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) { +llama_graph_result llama_context::graph_build(const llama_ubatch & ubatch, bool worst_case) { return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case); } @@ -1814,11 +1831,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = build_graph(ubatch, true); + auto res = graph_build(ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { + if (!ggml_backend_sched_reserve(sched.get(), res.gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } @@ -1828,7 +1845,9 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_cgraph * gf = build_graph(ubatch, false); + auto res = graph_build(ubatch, false); + + auto & gf = res.gf; // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -2073,7 +2092,9 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - ggml_cgraph * gf = build_graph(ubatch, false); + auto res = graph_build(ubatch, false); + + auto & gf = res.gf; ggml_backend_sched_alloc_graph(sched.get(), gf); diff --git a/src/llama-context.h b/src/llama-context.h index 7a10f84bd86bb..981afcc005b06 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -95,6 +95,9 @@ struct llama_context : public llama_graph_i { // zero-out inputs and create ggml_context virtual ggml_context_ptr graph_init(); + // TODO: add encode/decode graphs + virtual llama_graph_result graph_build(const llama_ubatch & ubatch, bool worst_case); + // returns the result of ggml_backend_sched_graph_compute_async execution virtual enum ggml_status graph_compute( ggml_cgraph * graph, @@ -145,9 +148,6 @@ struct llama_context : public llama_graph_i { const llama_ubatch & ubatch, int il); - // TODO: add encode/decode graphs - virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case); - // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, diff --git a/src/llama-graph.h b/src/llama-graph.h index d60b57491f2cb..de3cd2f043458 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -10,6 +10,13 @@ struct ggml_context; struct ggml_tensor; struct llama_ubatch; +struct llama_graph_result { + ggml_cgraph * gf = nullptr; + + ggml_tensor * t_logits = nullptr; + ggml_tensor * t_embd = nullptr; +}; + // TODO: can become more granular in the future class llama_graph_i { public: diff --git 
a/src/llama-model.cpp b/src/llama-model.cpp index 543e78d2b9c41..4950af59bf01e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4251,22 +4251,6 @@ struct llm_build_context { return cur; } - struct ggml_cgraph * build_kv_self_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - lgf.build_kv_self_shift(ctx0, gf); - - return gf; - } - - struct ggml_cgraph * build_kv_self_defrag() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - - lgf.build_kv_self_defrag(ctx0, gf); - - return gf; - } - struct ggml_tensor * build_inp_pos() { ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); cb(cur, "inp_pos", -1); @@ -4295,7 +4279,7 @@ struct llm_build_context { return cur; } - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { + void append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { @@ -4356,8 +4340,6 @@ struct llm_build_context { cb(cur, "result_embd_pooled", -1); ggml_build_forward_expand(gf, cur); - - return gf; } //struct ggml_tensor * build_pos_bucket(bool causal) { @@ -4406,9 +4388,7 @@ struct llm_build_context { return cur; } - struct ggml_cgraph * build_llama() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4563,13 +4543,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_deci() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_deci(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4719,13 +4695,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_baichuan(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4834,13 +4806,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_xverse() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_xverse(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4937,13 +4905,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_falcon(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5057,13 +5021,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_grok() { - struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_grok(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5211,13 +5171,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_dbrx() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_dbrx(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5334,13 +5290,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_starcoder(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5438,13 +5390,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_refact() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_refact(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5532,13 +5480,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_bert() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_bert(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -5726,13 +5670,9 @@ struct llm_build_context { cb(cur, "result_embd", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_bloom(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5827,13 +5767,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_mpt(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5967,13 +5903,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_stablelm() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - + void build_stablelm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6117,13 +6049,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_qwen() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_qwen(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; 
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6229,13 +6157,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_qwen2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6341,12 +6265,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_qwen2vl() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + void build_qwen2vl(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6457,13 +6378,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_qwen2moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_qwen2moe(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6601,13 +6518,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_phi2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_phi2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6722,13 +6635,11 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); - return gf; } - struct ggml_cgraph * build_phi3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_phi3(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6866,14 +6777,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - - struct ggml_cgraph * build_plamo() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - + void build_plamo(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6971,13 +6877,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_gpt2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_gpt2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7076,13 +6978,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_codeshell() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_codeshell(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t 
n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7187,13 +7085,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_orion() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_orion(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7305,13 +7199,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_internlm2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_internlm2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7423,13 +7313,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_minicpm3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_minicpm3(ggml_cgraph * gf) { //TODO: if the model varies, these parameters need to be read from the model const int64_t n_embd_base = 256; const float scale_embd = 12.0f; @@ -7633,13 +7519,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_gemma() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_gemma(ggml_cgraph * gf) { const int64_t n_embd_head_k = hparams.n_embd_head_k; struct ggml_tensor * cur; @@ -7741,13 +7623,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_gemma2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_gemma2(ggml_cgraph * gf) { const int64_t n_embd_head_k = hparams.n_embd_head_k; struct ggml_tensor * cur; @@ -7871,14 +7749,10 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - - struct ggml_cgraph * build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + // TODO: move up next to build_starcoder + void build_starcoder2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7991,13 +7865,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_mamba() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_mamba(ggml_cgraph * gf) { struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -8045,14 +7915,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_command_r() { - - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_command_r(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); const float f_logit_scale = hparams.f_logit_scale; @@ -8193,14 +8058,9 @@ struct llm_build_context { cb(cur, "result_output", 
-1); ggml_build_forward_expand(gf, cur); - - return gf; - } - struct ggml_cgraph * build_cohere2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_cohere2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); const float f_logit_scale = hparams.f_logit_scale; @@ -8322,8 +8182,6 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } // ref: https://allenai.org/olmo @@ -8332,9 +8190,7 @@ struct llm_build_context { // * clamp qkv // * removed bias // * removed MoE - struct ggml_cgraph * build_olmo() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_olmo(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8447,13 +8303,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_olmo2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_olmo2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8566,17 +8418,13 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } // based on the build_qwen2moe() function, changes: // * removed shared experts // * removed bias // * added q, k norm - struct ggml_cgraph * build_olmoe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_olmoe(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8692,13 +8540,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_openelm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_openelm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8817,13 +8661,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_gptneox() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_gptneox(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8960,13 +8800,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_arctic() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_arctic(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -9089,13 +8925,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_deepseek() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_deepseek(ggml_cgraph * gf) { 
const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -9244,13 +9076,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_deepseek2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_deepseek2(ggml_cgraph * gf) { bool is_lite = (hparams.n_layer == 27); // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. @@ -9471,13 +9299,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_bitnet() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_bitnet(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9622,12 +9446,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - return gf; } - //struct ggml_cgraph * build_t5_enc() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + //void build_t5_enc(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9749,13 +9570,9 @@ struct llm_build_context { // cb(cur, "result_norm", -1); // ggml_build_forward_expand(gf, cur); - - // return gf; //} - //struct ggml_cgraph * build_t5_dec() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + //void build_t5_dec(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9954,9 +9771,7 @@ struct llm_build_context { // return gf; //} - struct ggml_cgraph * build_jais() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_jais(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10041,13 +9856,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_chatglm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_chatglm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10170,13 +9981,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_nemotron() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_nemotron(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); //GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10290,13 +10097,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_exaone() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_exaone(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; 
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10412,13 +10215,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_rwkv6(ggml_cgraph * gf) { GGML_ASSERT(hparams.token_shift_count == 2); struct ggml_tensor * cur; @@ -10502,14 +10301,10 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_rwkv6qwen2(ggml_cgraph * gf) { GGML_ASSERT(n_embd == hparams.n_embd_k_s()); struct ggml_tensor * cur; @@ -10586,8 +10381,6 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } // ref: https://github.com/facebookresearch/chameleon @@ -10596,9 +10389,7 @@ struct llm_build_context { // * swin-norm // * removed bias // * removed MoE - struct ggml_cgraph * build_chameleon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_chameleon(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10759,13 +10550,9 @@ struct llm_build_context { cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); - - return gf; } - struct ggml_cgraph * build_wavtokenizer_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - + void build_wavtokenizer_dec(ggml_cgraph * gf) { struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -10911,231 +10698,233 @@ struct llm_build_context { cb(cur, "result_embd", -1); ggml_build_forward_expand(gf, cur); - - return gf; } }; -ggml_cgraph * llama_model::build_graph( +llama_graph_result llama_model::build_graph( llama_graph_i & lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, ggml_context_ptr && ctx, bool worst_case) const { - struct ggml_cgraph * result = NULL; + llama_graph_result result = {}; struct llm_build_context llm(lgf, *this, cparams, ubatch, std::move(ctx), worst_case); + auto & gf = result.gf; + + gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false); + switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_MINICPM: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: { - result = llm.build_llama(); + llm.build_llama(gf); } break; case LLM_ARCH_DECI: { - result = llm.build_deci(); + llm.build_deci(gf); } break; case LLM_ARCH_BAICHUAN: { - result = llm.build_baichuan(); + llm.build_baichuan(gf); } break; case LLM_ARCH_FALCON: { - result = llm.build_falcon(); + llm.build_falcon(gf); } break; case LLM_ARCH_GROK: { - result = llm.build_grok(); + llm.build_grok(gf); } break; case LLM_ARCH_STARCODER: { - result = llm.build_starcoder(); + llm.build_starcoder(gf); } break; case LLM_ARCH_REFACT: { - result = llm.build_refact(); + llm.build_refact(gf); } break; case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: { - result = llm.build_bert(); + llm.build_bert(gf); } break; case LLM_ARCH_BLOOM: { - result = llm.build_bloom(); + llm.build_bloom(gf); } break; case LLM_ARCH_MPT: { - result = llm.build_mpt(); + llm.build_mpt(gf); } break; case LLM_ARCH_STABLELM: { - 
result = llm.build_stablelm(); + llm.build_stablelm(gf); } break; case LLM_ARCH_QWEN: { - result = llm.build_qwen(); + llm.build_qwen(gf); } break; case LLM_ARCH_QWEN2: { - result = llm.build_qwen2(); + llm.build_qwen2(gf);; } break; case LLM_ARCH_QWEN2VL: { - result = llm.build_qwen2vl(); + llm.build_qwen2vl(gf); } break; case LLM_ARCH_QWEN2MOE: { - result = llm.build_qwen2moe(); + llm.build_qwen2moe(gf); } break; case LLM_ARCH_PHI2: { - result = llm.build_phi2(); + llm.build_phi2(gf); } break; case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: { - result = llm.build_phi3(); + llm.build_phi3(gf); } break; case LLM_ARCH_PLAMO: { - result = llm.build_plamo(); + llm.build_plamo(gf); } break; case LLM_ARCH_GPT2: { - result = llm.build_gpt2(); + llm.build_gpt2(gf); } break; case LLM_ARCH_CODESHELL: { - result = llm.build_codeshell(); + llm.build_codeshell(gf); } break; case LLM_ARCH_ORION: { - result = llm.build_orion(); + llm.build_orion(gf); } break; case LLM_ARCH_INTERNLM2: { - result = llm.build_internlm2(); + llm.build_internlm2(gf); } break; case LLM_ARCH_MINICPM3: { - result = llm.build_minicpm3(); + llm.build_minicpm3(gf); } break; case LLM_ARCH_GEMMA: { - result = llm.build_gemma(); + llm.build_gemma(gf); } break; case LLM_ARCH_GEMMA2: { - result = llm.build_gemma2(); + llm.build_gemma2(gf); } break; case LLM_ARCH_STARCODER2: { - result = llm.build_starcoder2(); + llm.build_starcoder2(gf); } break; case LLM_ARCH_MAMBA: { - result = llm.build_mamba(); + llm.build_mamba(gf); } break; case LLM_ARCH_XVERSE: { - result = llm.build_xverse(); + llm.build_xverse(gf); } break; case LLM_ARCH_COMMAND_R: { - result = llm.build_command_r(); + llm.build_command_r(gf); } break; case LLM_ARCH_COHERE2: { - result = llm.build_cohere2(); + llm.build_cohere2(gf); } break; case LLM_ARCH_DBRX: { - result = llm.build_dbrx(); + llm.build_dbrx(gf); } break; case LLM_ARCH_OLMO: { - result = llm.build_olmo(); + llm.build_olmo(gf); } break; case LLM_ARCH_OLMO2: { - result = llm.build_olmo2(); + llm.build_olmo2(gf); } break; case LLM_ARCH_OLMOE: { - result = llm.build_olmoe(); + llm.build_olmoe(gf); } break; case LLM_ARCH_OPENELM: { - result = llm.build_openelm(); + llm.build_openelm(gf); } break; case LLM_ARCH_GPTNEOX: { - result = llm.build_gptneox(); + llm.build_gptneox(gf); } break; case LLM_ARCH_ARCTIC: { - result = llm.build_arctic(); + llm.build_arctic(gf); } break; case LLM_ARCH_DEEPSEEK: { - result = llm.build_deepseek(); + llm.build_deepseek(gf); } break; case LLM_ARCH_DEEPSEEK2: { - result = llm.build_deepseek2(); + llm.build_deepseek2(gf); } break; case LLM_ARCH_CHATGLM: { - result = llm.build_chatglm(); + llm.build_chatglm(gf); } break; case LLM_ARCH_BITNET: { - result = llm.build_bitnet(); + llm.build_bitnet(gf); } break; //case LLM_ARCH_T5: // { // if (lctx.is_encoding) { - // result = llm.build_t5_enc(); + // llm.build_t5_enc(gf); // } else { - // result = llm.build_t5_dec(); + // llm.build_t5_dec(gf); // } // } break; //case LLM_ARCH_T5ENCODER: // { - // result = llm.build_t5_enc(); + // llm.build_t5_enc(gf); // } break; case LLM_ARCH_JAIS: { - result = llm.build_jais(); + llm.build_jais(gf); } break; case LLM_ARCH_NEMOTRON: { - result = llm.build_nemotron(); + llm.build_nemotron(gf); } break; case LLM_ARCH_EXAONE: { - result = llm.build_exaone(); + llm.build_exaone(gf); } break; case LLM_ARCH_RWKV6: { - result = llm.build_rwkv6(); + llm.build_rwkv6(gf); } break; case LLM_ARCH_RWKV6QWEN2: { - result = llm.build_rwkv6qwen2(); + llm.build_rwkv6qwen2(gf); } break; case LLM_ARCH_CHAMELEON: { - result = 
llm.build_chameleon(); + llm.build_chameleon(gf); } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - result = llm.build_wavtokenizer_dec(); + llm.build_wavtokenizer_dec(gf); } break; default: GGML_ABORT("fatal error"); @@ -11143,7 +10932,7 @@ ggml_cgraph * llama_model::build_graph( // add on pooling layer if (cparams.embeddings) { - result = llm.append_pooling(result); + llm.append_pooling(gf); } return result; diff --git a/src/llama-model.h b/src/llama-model.h index 0374b484b10ab..a3267bbbbb44a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -16,6 +16,7 @@ class llama_graph_i; struct llama_cparams; struct llama_ubatch; struct llama_model_loader; +struct llama_graph_result; // available models enum llm_type { @@ -368,8 +369,7 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; // TODO: add encode/decode graphs - // TODO: return a struct containing the graph and the output tensors, such as logits, embeddings, etc. - ggml_cgraph * build_graph( + llama_graph_result build_graph( llama_graph_i & lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, From 172f61690cb612be187980c5174707aeb5871714 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 13:48:43 +0200 Subject: [PATCH 49/84] cont : return important tensors ggml-ci --- src/llama-context.cpp | 29 +++-- src/llama-context.h | 5 +- src/llama-graph.h | 6 +- src/llama-model.cpp | 289 ++++++++++++++++++++++++++++++++++++++---- src/llama-model.h | 10 +- 5 files changed, 293 insertions(+), 46 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 55f1c03826468..d39263d288f8b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -255,7 +255,8 @@ void llama_context::init() { // reserve pp graph first so that buffers are only allocated once { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto res_pp = graph_build(ubatch_pp, true); + auto ctx = graph_init(); + auto res_pp = graph_build(ctx, ubatch_pp, true); auto & gf_pp = res_pp.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); @@ -269,7 +270,8 @@ void llama_context::init() { // reserve with tg graph to get the number of splits and nodes { llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto res_tg = graph_build(ubatch_tg, true); + auto ctx = graph_init(); + auto res_tg = graph_build(ctx, ubatch_tg, true); auto & gf_tg = res_tg.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); @@ -282,7 +284,8 @@ void llama_context::init() { // reserve again with pp graph to avoid ggml-alloc reallocations during inference { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto res_pp = graph_build(ubatch_pp, true); + auto ctx = graph_init(); + auto res_pp = graph_build(ctx, ubatch_pp, true); auto & gf_pp = res_pp.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); @@ -569,6 +572,13 @@ ggml_context_ptr llama_context::graph_init() { return ggml_context_ptr { ggml_init(params) }; } +llama_graph_result llama_context::graph_build( + ggml_context_ptr & ctx, + const llama_ubatch & ubatch, + bool worst_case) { + return model.build_graph(ctx, *this, cparams, ubatch, 
worst_case); +} + enum ggml_status llama_context::graph_compute( ggml_cgraph * graph, bool batched) { @@ -907,10 +917,6 @@ void llama_context::build_cb( } } -llama_graph_result llama_context::graph_build(const llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case); -} - llama_perf_context_data llama_context::perf_get_data() const { llama_perf_context_data data = {}; @@ -1831,7 +1837,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto res = graph_build(ubatch, true); + auto ctx = graph_init(); + auto res = graph_build(ctx, ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); @@ -1845,7 +1852,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto res = graph_build(ubatch, false); + auto ctx = graph_init(); + auto res = graph_build(ctx, ubatch, false); auto & gf = res.gf; @@ -2092,7 +2100,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto res = graph_build(ubatch, false); + auto ctx = graph_init(); + auto res = graph_build(ctx, ubatch, false); auto & gf = res.gf; diff --git a/src/llama-context.h b/src/llama-context.h index 981afcc005b06..e3ab12e59c746 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -96,7 +96,10 @@ struct llama_context : public llama_graph_i { virtual ggml_context_ptr graph_init(); // TODO: add encode/decode graphs - virtual llama_graph_result graph_build(const llama_ubatch & ubatch, bool worst_case); + virtual llama_graph_result graph_build( + ggml_context_ptr & ctx, + const llama_ubatch & ubatch, + bool worst_case); // returns the result of ggml_backend_sched_graph_compute_async execution virtual enum ggml_status graph_compute( diff --git a/src/llama-graph.h b/src/llama-graph.h index de3cd2f043458..14d0c5da0a359 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -13,8 +13,10 @@ struct llama_ubatch; struct llama_graph_result { ggml_cgraph * gf = nullptr; - ggml_tensor * t_logits = nullptr; - ggml_tensor * t_embd = nullptr; + // important graph nodes + ggml_tensor * t_logits = nullptr; + ggml_tensor * t_embd = nullptr; + ggml_tensor * t_embd_pooled = nullptr; }; // TODO: can become more granular in the future diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4950af59bf01e..ecfd6f185039a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3841,17 +3841,19 @@ struct llm_build_context { const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; - const ggml_context_ptr ctx = nullptr; - ggml_context * ctx0 = nullptr; + ggml_context_ptr & ctx; + ggml_context * ctx0 = nullptr; + + llama_graph_result res; // TODO: consider making the entire interface noexcept llm_build_context( - llama_graph_i & lgf, - const llama_model & model, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - ggml_context_ptr && ctx, - bool worst_case) : + ggml_context_ptr & ctx, + llama_graph_i & lgf, + const llama_model & model, 
+ const llama_cparams & cparams, + const llama_ubatch & ubatch, + bool worst_case) : lgf (lgf), model (model), hparams (model.hparams), @@ -3883,7 +3885,7 @@ struct llm_build_context { flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx (std::move(ctx)), + ctx (ctx), ctx0 (this->ctx.get()) { } @@ -4280,16 +4282,18 @@ struct llm_build_context { } void append_pooling(struct ggml_cgraph * gf) { - // find result_norm tensor for input - struct ggml_tensor * inp = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - inp = ggml_graph_node(gf, i); - if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { - break; - } + struct ggml_tensor * inp = res.t_embd; + + //// find result_norm tensor for input + //for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + // inp = ggml_graph_node(gf, i); + // if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { + // break; + // } + + // inp = nullptr; + //} - inp = nullptr; - } GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); struct ggml_tensor * cur; @@ -4338,6 +4342,7 @@ struct llm_build_context { } cb(cur, "result_embd_pooled", -1); + res.t_embd_pooled = cur; ggml_build_forward_expand(gf, cur); } @@ -4390,6 +4395,7 @@ struct llm_build_context { void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4530,7 +4536,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -4541,12 +4549,14 @@ struct llm_build_context { } cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_deci(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4682,7 +4692,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -4693,12 +4705,14 @@ struct llm_build_context { } cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_baichuan(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4799,17 +4813,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_xverse(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -4898,11 +4917,15 @@ struct llm_build_context { cur = inpL; cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -4910,6 +4933,7 @@ struct llm_build_context { void build_falcon(ggml_cgraph * gf) { const int64_t 
n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5015,16 +5039,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_grok(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5158,7 +5187,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -5169,6 +5200,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, 0.5773502691896257f); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5176,6 +5208,7 @@ struct llm_build_context { void build_dbrx(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5282,12 +5315,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5295,6 +5331,7 @@ struct llm_build_context { void build_starcoder(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5384,16 +5421,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_refact(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5473,11 +5515,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5668,6 +5714,7 @@ struct llm_build_context { cur = inpL; cb(cur, "result_embd", -1); + res.t_embd = cur; ggml_build_forward_expand(gf, cur); } @@ -5675,6 +5722,7 @@ struct llm_build_context { void build_bloom(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5761,10 +5809,14 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5772,6 +5824,7 @@ struct llm_build_context { void build_mpt(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = 
hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -5897,16 +5950,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_stablelm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6042,17 +6100,22 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_qwen(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6150,17 +6213,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_qwen2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6258,17 +6326,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_qwen2vl(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6371,17 +6444,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_qwen2moe(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6511,11 +6589,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6523,6 +6605,7 @@ struct llm_build_context { void build_phi2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6628,13 +6711,17 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6642,6 +6729,7 @@ struct llm_build_context { void build_phi3(ggml_cgraph * gf) { const 
int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6656,7 +6744,7 @@ struct llm_build_context { lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { - auto residual = inpL; + auto * residual = inpL; // self-attention { @@ -6766,7 +6854,9 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); @@ -6774,13 +6864,16 @@ struct llm_build_context { cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); } + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_plamo(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6870,11 +6963,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6882,6 +6979,7 @@ struct llm_build_context { void build_gpt2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -6972,10 +7070,14 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6983,6 +7085,7 @@ struct llm_build_context { void build_codeshell(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7079,16 +7182,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_orion(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7192,17 +7300,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_internlm2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7306,11 +7419,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7507,7 +7624,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, 
-1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head scaling const float scale_lmhead = float(n_embd_base)/float(n_embd); @@ -7516,7 +7635,9 @@ struct llm_build_context { // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7616,11 +7737,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7736,7 +7861,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -7747,6 +7874,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7754,6 +7882,7 @@ struct llm_build_context { // TODO: move up next to build_starcoder void build_starcoder2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7858,11 +7987,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7908,18 +8041,24 @@ struct llm_build_context { cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_command_r(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; struct ggml_tensor * cur; @@ -8046,7 +8185,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -8056,13 +8197,16 @@ struct llm_build_context { } cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_cohere2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const float f_logit_scale = hparams.f_logit_scale; struct ggml_tensor * cur; @@ -8170,7 +8314,9 @@ struct llm_build_context { cur = inpL; cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -8180,6 +8326,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8192,6 +8339,7 @@ struct llm_build_context { // * removed MoE void build_olmo(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8296,17 +8444,22 @@ struct llm_build_context { cur = build_norm(cur, NULL, NULL, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = 
build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_olmo2(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8411,11 +8564,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8426,6 +8583,7 @@ struct llm_build_context { // * added q, k norm void build_olmoe(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8533,17 +8691,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_openelm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -8655,10 +8818,14 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8666,6 +8833,7 @@ struct llm_build_context { void build_gptneox(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -8794,16 +8962,21 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_arctic(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -8918,17 +9091,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_deepseek(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -9068,12 +9246,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9292,17 +9473,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_bitnet(ggml_cgraph * gf) { 
const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -9438,12 +9624,16 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output cur = build_lora_mm(model.tok_embd, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9451,6 +9641,7 @@ struct llm_build_context { //void build_t5_enc(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // struct ggml_tensor * cur; @@ -9567,7 +9758,9 @@ struct llm_build_context { // cur = build_norm(cur, // model.output_norm_enc, NULL, // LLM_NORM_RMS, -1); + // // cb(cur, "result_norm", -1); + // res.t_embd = cur; // ggml_build_forward_expand(gf, cur); //} @@ -9575,6 +9768,7 @@ struct llm_build_context { //void build_t5_dec(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); // struct ggml_tensor * cur; @@ -9760,11 +9954,15 @@ struct llm_build_context { // cur = build_norm(cur, // model.output_norm, NULL, // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + // res.t_embd = cur; // // lm_head // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + // res.t_logits = cur; // ggml_build_forward_expand(gf, cur); @@ -9774,6 +9972,7 @@ struct llm_build_context { void build_jais(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -9849,11 +10048,14 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9861,6 +10063,7 @@ struct llm_build_context { void build_chatglm(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; @@ -9975,16 +10178,21 @@ struct llm_build_context { model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_nemotron(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); //GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10090,17 +10298,22 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } void build_exaone(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10208,11 +10421,15 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, 
LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10290,15 +10507,21 @@ struct llm_build_context { } cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10375,10 +10598,14 @@ struct llm_build_context { cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10391,6 +10618,7 @@ struct llm_build_context { // * removed MoE void build_chameleon(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -10530,7 +10758,9 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res.t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -10546,8 +10776,11 @@ struct llm_build_context { struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens); img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX); cb(img_logits, "img_logits", -1); + cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); + cb(cur, "result_output", -1); + res.t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10695,23 +10928,23 @@ struct llm_build_context { cur = build_lora_mm(model.output, cur); cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_embd", -1); + res.t_embd = cur; ggml_build_forward_expand(gf, cur); } }; llama_graph_result llama_model::build_graph( - llama_graph_i & lgf, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - ggml_context_ptr && ctx, + ggml_context_ptr & ctx, + llama_graph_i & lgf, + const llama_cparams & cparams, + const llama_ubatch & ubatch, bool worst_case) const { - llama_graph_result result = {}; + struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case); - struct llm_build_context llm(lgf, *this, cparams, ubatch, std::move(ctx), worst_case); - - auto & gf = result.gf; + auto & gf = llm.res.gf; gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false); @@ -10935,7 +11168,7 @@ llama_graph_result llama_model::build_graph( llm.append_pooling(gf); } - return result; + return llm.res; } // diff --git a/src/llama-model.h b/src/llama-model.h index a3267bbbbb44a..f5d1f7b79f50b 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -370,11 +370,11 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( - llama_graph_i & lgf, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - ggml_context_ptr && ctx, - bool worst_case) const; + ggml_context_ptr & ctx, + llama_graph_i & lgf, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + bool worst_case) const; private: struct impl; From bc6f187e9c0d40ca355e088708e4323bac2828da Mon Sep 17 00:00:00 2001 From: 
Georgi Gerganov Date: Tue, 18 Feb 2025 14:24:17 +0200 Subject: [PATCH 50/84] cont : use returned tensors from the graph build ggml-ci --- src/llama-context.cpp | 60 ++++++++++--------------------------------- 1 file changed, 13 insertions(+), 47 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d39263d288f8b..b508a4f8d194c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1855,7 +1855,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { auto ctx = graph_init(); auto res = graph_build(ctx, ubatch, false); - auto & gf = res.gf; + auto * gf = res.gf; // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -1863,29 +1863,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { input_set(ubatch); - // the output is always the last tensor in the graph - struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); - struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); - - if (n_outputs == 0) { - // no output - t_logits = nullptr; - t_embd = nullptr; - } else if (cparams.embeddings) { - t_logits = nullptr; // do not extract logits for embedding case - t_embd = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - t_embd = ggml_graph_node(gf, i); - break; - } - } - GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); - } else { - t_embd = nullptr; // do not extract embeddings when not needed - GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); - } - const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { switch (compute_status) { @@ -1914,8 +1891,15 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} + auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; + auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; + + if (t_embd && res.t_embd_pooled) { + t_embd = res.t_embd_pooled; + } + // extract logits - if (t_logits) { + if (t_logits && n_outputs > 0) { ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(logits != nullptr); @@ -1930,7 +1914,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { } // extract embeddings - if (t_embd) { + if (t_embd && n_outputs > 0) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); @@ -2103,32 +2087,12 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { auto ctx = graph_init(); auto res = graph_build(ctx, ubatch, false); - auto & gf = res.gf; + auto * gf = res.gf; ggml_backend_sched_alloc_graph(sched.get(), gf); input_set(ubatch); - // the output embeddings after the final encoder normalization - struct ggml_tensor * t_embd = nullptr; - - // there are two cases here - if (llama_model_has_decoder(&model)) { - // first case is an encoder-decoder T5 model where embeddings are passed to decoder - t_embd = ggml_graph_node(gf, -1); - GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); - } else { - // second case is an encoder-only T5 model - if (cparams.embeddings) { - // only output embeddings if required - t_embd = ggml_graph_node(gf, -1); - if (strcmp(t_embd->name, "result_embd_pooled") != 0) { - t_embd = ggml_graph_node(gf, -2); - } - GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); - } - } - const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: @@ -2142,6 +2106,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { return -3; } + auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + // extract embeddings if (t_embd) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); From befe14f06f2f36e16f87a79706d874d406c51bfa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 14:47:53 +0200 Subject: [PATCH 51/84] llama : reorder encode/decode in sources --- src/llama-context.cpp | 324 +++++++++++++++++++++--------------------- src/llama-context.h | 20 +-- 2 files changed, 172 insertions(+), 172 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b508a4f8d194c..0e0af806d66c9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1655,6 +1655,168 @@ ggml_context_ptr llama_context_kv_self::graph_init() { return llama_context::graph_init(); } +int llama_context_kv_self::encode(llama_batch & inp_batch) { + is_encoding = true; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + const int32_t n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (output_reserve(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (int32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + inp_embd_enc = NULL; + n_outputs = n_tokens; + + //batch_manager->prepare(ubatch); + + // TODO: do reserve + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto ctx = graph_init(); + auto res = graph_build(ctx, ubatch, false); + + auto * gf = res.gf; + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + auto * t_embd = res.t_embd_pooled ? 
res.t_embd_pooled : res.t_embd; + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + if (llama_model_has_decoder(&model)) { + embd_enc.resize(n_tokens*n_embd); + float * embd_out = embd_enc.data(); + + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + // remember the sequence ids used during the encoding - needed for cross attention later + seq_ids_enc.resize(n_tokens); + for (int32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + int llama_context_kv_self::decode(llama_batch & inp_batch) { is_encoding = false; @@ -2020,168 +2182,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -int llama_context_kv_self::encode(llama_batch & inp_batch) { - is_encoding = true; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - const int32_t n_tokens = batch.n_tokens; - - const auto & hparams = model.hparams; - - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - - if (batch.token) { - for (int32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - } - } - - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); - - if (t_compute_start_us == 0) { - t_compute_start_us = ggml_time_us(); - } - - n_queued_tokens += n_tokens; - - const int64_t n_embd = hparams.n_embd; - - sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); - - // reserve output buffer - if (output_reserve(n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); - return -2; - }; - - for (int32_t i = 0; i < n_tokens; ++i) { - output_ids[i] = i; - } - - inp_embd_enc = NULL; - n_outputs = n_tokens; - - //batch_manager->prepare(ubatch); - - // TODO: do reserve - GGML_ASSERT(need_reserve == false); - - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - - auto ctx = graph_init(); - auto res = graph_build(ctx, ubatch, false); - - auto * gf = res.gf; - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - input_set(ubatch); - - const auto compute_status = graph_compute(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - - auto * t_embd = res.t_embd_pooled ? 
res.t_embd_pooled : res.t_embd; - - // extract embeddings - if (t_embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); - GGML_ASSERT(backend_embd != nullptr); - - if (llama_model_has_decoder(&model)) { - embd_enc.resize(n_tokens*n_embd); - float * embd_out = embd_enc.data(); - - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - // remember the sequence ids used during the encoding - needed for cross attention later - seq_ids_enc.resize(n_tokens); - for (int32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - seq_ids_enc[i].insert(seq_id); - } - } - } else { - GGML_ASSERT(embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(embd != nullptr); - float * embd_out = embd; - - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings - auto & embd_seq_out = embd_seq; - embd_seq_out.clear(); - - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - for (int32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - } - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. 
- ggml_backend_sched_reset(sched.get()); - - return 0; -} - llama_pos llama_context_kv_self::pos_max() const { return kv_self.pos_max(); } diff --git a/src/llama-context.h b/src/llama-context.h index e3ab12e59c746..9f6abfc824b3d 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -116,30 +116,30 @@ struct llama_context : public llama_graph_i { // TODO: maybe remove this virtual void output_reorder(); - // decode a batch of tokens by evaluating the transformer - // in case of unsuccessful decoding (error or warning), - // the kv_cache state will be returned to its original state - // (for non-recurrent models) or cleaned (for recurrent models) + // encode a batch of tokens by evaluating the encoder part of the transformer // // - lctx: llama context - // - inp_batch: batch to evaluate + // - batch: batch to evaluate // // return 0 on success // return positive int on warning // return negative int on error // - virtual int decode(llama_batch & inp_batch) = 0; + virtual int encode(llama_batch & inp_batch) = 0; - // encode a batch of tokens by evaluating the encoder part of the transformer + // decode a batch of tokens by evaluating the transformer + // in case of unsuccessful decoding (error or warning), + // the kv_cache state will be returned to its original state + // (for non-recurrent models) or cleaned (for recurrent models) // // - lctx: llama context - // - batch: batch to evaluate + // - inp_batch: batch to evaluate // // return 0 on success // return positive int on warning // return negative int on error // - virtual int encode(llama_batch & inp_batch) = 0; + virtual int decode(llama_batch & inp_batch) = 0; // // graph build API (generic) @@ -336,8 +336,8 @@ class llama_context_kv_self : public llama_context { virtual void input_set(const llama_ubatch & ubatch) override; - virtual int decode(llama_batch & inp_batch) override; virtual int encode(llama_batch & inp_batch) override; + virtual int decode(llama_batch & inp_batch) override; // max token position across all sequences in the current context llama_pos pos_max() const; From 9e50456e19ac5c24c40387e6b4a2b3072f7a9d8e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 14:53:02 +0200 Subject: [PATCH 52/84] context : minor simplify ggml-ci --- src/llama-context.cpp | 24 +++++++++++------------- src/llama-context.h | 2 +- src/llama-model.cpp | 20 +++++++++----------- src/llama-model.h | 2 +- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0e0af806d66c9..d9735cfaa41fc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -256,7 +256,7 @@ void llama_context::init() { { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; auto ctx = graph_init(); - auto res_pp = graph_build(ctx, ubatch_pp, true); + auto res_pp = graph_build(ctx.get(), ubatch_pp, true); auto & gf_pp = res_pp.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); @@ -271,7 +271,7 @@ void llama_context::init() { { llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; auto ctx = graph_init(); - auto res_tg = graph_build(ctx, ubatch_tg, true); + auto res_tg = graph_build(ctx.get(), ubatch_tg, true); auto & gf_tg = res_tg.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); @@ -285,7 +285,7 @@ 
void llama_context::init() { { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; auto ctx = graph_init(); - auto res_pp = graph_build(ctx, ubatch_pp, true); + auto res_pp = graph_build(ctx.get(), ubatch_pp, true); auto & gf_pp = res_pp.gf; if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); @@ -573,7 +573,7 @@ ggml_context_ptr llama_context::graph_init() { } llama_graph_result llama_context::graph_build( - ggml_context_ptr & ctx, + ggml_context * ctx, const llama_ubatch & ubatch, bool worst_case) { return model.build_graph(ctx, *this, cparams, ubatch, worst_case); @@ -1720,7 +1720,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto ctx = graph_init(); - auto res = graph_build(ctx, ubatch, false); + auto res = graph_build(ctx.get(), ubatch, false); auto * gf = res.gf; @@ -2000,7 +2000,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; auto ctx = graph_init(); - auto res = graph_build(ctx, ubatch, true); + auto res = graph_build(ctx.get(), ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); @@ -2015,7 +2015,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto ctx = graph_init(); - auto res = graph_build(ctx, ubatch, false); + auto res = graph_build(ctx.get(), ubatch, false); auto * gf = res.gf; @@ -2483,11 +2483,10 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_reset(sched.get()); auto ctx = graph_init(); - auto * ctx0 = ctx.get(); - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); - build_kv_self_shift(ctx0, gf); + build_kv_self_shift(ctx.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2512,11 +2511,10 @@ void llama_context_kv_self::kv_self_update() { ggml_backend_sched_reset(sched.get()); auto ctx = graph_init(); - auto * ctx0 = ctx.get(); - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); - build_kv_self_defrag(ctx0, gf); + build_kv_self_defrag(ctx.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); diff --git a/src/llama-context.h b/src/llama-context.h index 9f6abfc824b3d..4bf8244e625c1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -97,7 +97,7 @@ struct llama_context : public llama_graph_i { // TODO: add encode/decode graphs virtual llama_graph_result graph_build( - ggml_context_ptr & ctx, + ggml_context * ctx, const llama_ubatch & ubatch, bool worst_case); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ecfd6f185039a..289c3422e3dcf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3841,19 +3841,18 @@ struct llm_build_context { const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; - ggml_context_ptr & ctx; - ggml_context * ctx0 = nullptr; + ggml_context * ctx0 = nullptr; llama_graph_result res; // TODO: consider making the entire interface noexcept llm_build_context( - ggml_context_ptr & ctx, - llama_graph_i 
& lgf, - const llama_model & model, - const llama_cparams & cparams, - const llama_ubatch & ubatch, - bool worst_case) : + ggml_context * ctx, + llama_graph_i & lgf, + const llama_model & model, + const llama_cparams & cparams, + const llama_ubatch & ubatch, + bool worst_case) : lgf (lgf), model (model), hparams (model.hparams), @@ -3885,8 +3884,7 @@ struct llm_build_context { flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx (ctx), - ctx0 (this->ctx.get()) { + ctx0 (ctx) { } // TODO: tmp @@ -10937,7 +10935,7 @@ struct llm_build_context { }; llama_graph_result llama_model::build_graph( - ggml_context_ptr & ctx, + ggml_context * ctx, llama_graph_i & lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, diff --git a/src/llama-model.h b/src/llama-model.h index f5d1f7b79f50b..a7c53bdbdc7ea 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -370,7 +370,7 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( - ggml_context_ptr & ctx, + ggml_context * ctx, llama_graph_i & lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, From 2bffc2d514ac2a86acae27037e0e466ebc723fd4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 14:57:26 +0200 Subject: [PATCH 53/84] model : pass llama_graph_i as ptr ggml-ci --- src/llama-context.cpp | 2 +- src/llama-model.cpp | 252 +++++++++++++++++++++--------------------- src/llama-model.h | 2 +- 3 files changed, 128 insertions(+), 128 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d9735cfaa41fc..bfcdf6cddcf30 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -576,7 +576,7 @@ llama_graph_result llama_context::graph_build( ggml_context * ctx, const llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(ctx, *this, cparams, ubatch, worst_case); + return model.build_graph(ctx, this, cparams, ubatch, worst_case); } enum ggml_status llama_context::graph_compute( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 289c3422e3dcf..350dfd89cee3d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3804,7 +3804,6 @@ enum llm_norm_type { }; struct llm_build_context { - llama_graph_i & lgf; const llama_model & model; const llama_hparams & hparams; const llama_cparams & cparams; @@ -3842,18 +3841,18 @@ struct llm_build_context { const enum llama_rope_type rope_type; ggml_context * ctx0 = nullptr; + llama_graph_i * lgf = nullptr; llama_graph_result res; // TODO: consider making the entire interface noexcept llm_build_context( ggml_context * ctx, - llama_graph_i & lgf, + llama_graph_i * lgf, const llama_model & model, const llama_cparams & cparams, const llama_ubatch & ubatch, bool worst_case) : - lgf (lgf), model (model), hparams (model.hparams), cparams (cparams), @@ -3884,17 +3883,18 @@ struct llm_build_context { flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - ctx0 (ctx) { + ctx0 (ctx), + lgf (lgf) { } // TODO: tmp void cb(struct ggml_tensor * cur, const char * name, int il) { - lgf.build_cb(cur, name, ubatch, il); + lgf->build_cb(cur, name, ubatch, il); } // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch); + struct ggml_tensor * inpL = lgf->build_inp_embd(ctx0, tok_embd, ubatch); cb(inpL, "inp_embd", -1); return inpL; @@ -3904,7 +3904,7 @@ struct llm_build_context { struct ggml_tensor * build_lora_mm( struct 
ggml_tensor * w, struct ggml_tensor * cur) { - return lgf.build_lora_mm(ctx0, w, cur); + return lgf->build_lora_mm(ctx0, w, cur); } // TODO: tmp @@ -3912,7 +3912,7 @@ struct llm_build_context { struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b struct ggml_tensor * ids) { - return lgf.build_lora_mm_id(ctx0, w, cur, ids); + return lgf->build_lora_mm_id(ctx0, w, cur, ids); } struct ggml_tensor * build_norm( @@ -4211,12 +4211,12 @@ struct llm_build_context { ggml_build_forward_expand(graph, v_cur); //build_kv_store(graph, k_cur, v_cur, il); - lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + lgf->build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); struct ggml_tensor * cur; //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cur = lgf->build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -4252,28 +4252,28 @@ struct llm_build_context { } struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); cb(cur, "inp_pos", -1); return cur; } struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); cb(cur, "inp_out_ids", -1); return cur; } struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); cb(cur, "inp_mean", -1); return cur; } struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); cb(cur, "inp_cls", -1); return cur; @@ -4378,14 +4378,14 @@ struct llm_build_context { //} struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); cb(cur, "embd_enc", -1); return cur; } struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); cb(cur, "KQ_mask_cross", -1); return cur; @@ -4405,7 +4405,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4420,7 +4420,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -4522,7 +4522,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4566,7 +4566,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4592,7 +4592,7 @@ struct llm_build_context { } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -4678,7 +4678,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4722,7 +4722,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4799,7 +4799,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4838,7 +4838,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4905,7 +4905,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4943,7 +4943,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5023,7 +5023,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5066,7 +5066,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5173,7 +5173,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5218,7 +5218,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5301,7 +5301,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5340,7 +5340,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5408,7 +5408,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5441,7 +5441,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ 
-5501,7 +5501,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5555,7 +5555,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5626,7 +5626,7 @@ struct llm_build_context { cb(kq, "kq", il); //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lgf.build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); + kq = lgf->build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -5728,7 +5728,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); inpL = build_norm(inpL, model.tok_norm, @@ -5796,7 +5796,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5831,7 +5831,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -5935,7 +5935,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5973,7 +5973,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -6085,7 +6085,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6124,7 +6124,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6199,7 +6199,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6238,7 +6238,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6312,7 +6312,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ 
-6351,7 +6351,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6430,7 +6430,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6469,7 +6469,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6575,7 +6575,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6616,7 +6616,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6698,7 +6698,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6739,7 +6739,7 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6747,7 +6747,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, @@ -6841,7 +6841,7 @@ struct llm_build_context { cur = ggml_add(ctx0, residual, cur); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6883,7 +6883,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -6949,7 +6949,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6989,7 +6989,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7057,7 +7057,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - 
cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7095,7 +7095,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7169,7 +7169,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7206,7 +7206,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7286,7 +7286,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7325,7 +7325,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7405,7 +7405,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7453,12 +7453,12 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -7610,7 +7610,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7654,7 +7654,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -7723,7 +7723,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7762,7 +7762,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -7847,7 +7847,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7892,7 +7892,7 @@ struct 
llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7973,7 +7973,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8005,8 +8005,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm @@ -8016,7 +8016,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); - cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -8028,7 +8028,7 @@ struct llm_build_context { // residual cur = ggml_add(ctx0, cur, inpL); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8067,7 +8067,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { @@ -8171,7 +8171,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8215,7 +8215,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8233,7 +8233,7 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -8302,7 +8302,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8349,7 +8349,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8430,7 +8430,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = 
lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8469,7 +8469,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8550,7 +8550,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8593,7 +8593,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8677,7 +8677,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8714,7 +8714,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8804,7 +8804,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -8842,7 +8842,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -8919,7 +8919,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, attn_out); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8948,7 +8948,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -8986,7 +8986,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9077,7 +9077,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9116,7 +9116,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9132,7 +9132,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -9232,7 +9232,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9279,7 +9279,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9459,7 +9459,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -9497,7 +9497,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9978,7 +9978,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10072,7 +10072,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10202,7 +10202,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10284,7 +10284,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10323,7 +10323,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10337,7 +10337,7 @@ struct llm_build_context { // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = lgf.build_rope_factors(il); + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); // compute Q and K and RoPE them struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -10407,7 +10407,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - 
cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10441,8 +10441,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10451,7 +10451,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; - struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( + struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load( ctx0, gf, state_copy, state_mask, ubatch, il, worst_case ); @@ -10468,7 +10468,7 @@ struct llm_build_context { 1 ); - cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -10491,13 +10491,13 @@ struct llm_build_context { ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 ); - ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { cur = ggml_scale(ctx0, cur, 0.5F); } - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10533,8 +10533,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10545,7 +10545,7 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; - struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load( + struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load( ctx0, gf, state_copy, state_mask, ubatch, il, worst_case ); @@ -10559,10 +10559,10 @@ struct llm_build_context { 1 ); - cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); - ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); struct ggml_tensor * ffn_inp = 
ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -10583,7 +10583,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10628,7 +10628,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10744,7 +10744,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lgf.build_cvec(ctx0, cur, il); + cur = lgf->build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -10936,7 +10936,7 @@ struct llm_build_context { llama_graph_result llama_model::build_graph( ggml_context * ctx, - llama_graph_i & lgf, + llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, bool worst_case) const { diff --git a/src/llama-model.h b/src/llama-model.h index a7c53bdbdc7ea..2a9fca7d40c6d 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -371,7 +371,7 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( ggml_context * ctx, - llama_graph_i & lgf, + llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, bool worst_case) const; From f5cedbcaaa5070d17f5290a03fd3124d58a3b824 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Feb 2025 21:26:42 +0200 Subject: [PATCH 54/84] kv-cache : prepare for abstraction ggml-ci --- src/llama-context.cpp | 518 +++++++++-------------------------------- src/llama-context.h | 47 ++-- src/llama-graph.h | 48 ++-- src/llama-kv-cache.cpp | 319 ++++++++++++++++++++++++- src/llama-kv-cache.h | 76 +++--- src/llama-model.cpp | 117 +++++----- src/llama-model.h | 3 +- 7 files changed, 594 insertions(+), 534 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bfcdf6cddcf30..454e141c85796 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -201,7 +201,7 @@ void llama_context::init() { backend_ptrs.push_back(backend.get()); } - const size_t max_nodes = model.max_nodes(); + const size_t max_nodes = this->max_nodes(); // buffer used to store the computation graph and the tensor meta data // TODO: move to base class @@ -255,39 +255,36 @@ void llama_context::init() { // reserve pp graph first so that buffers are only allocated once { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_pp = graph_build(ctx.get(), ubatch_pp, true); - auto & gf_pp = res_pp.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_pp = ggml_graph_n_nodes(gf_pp); + n_nodes_pp = ggml_graph_n_nodes(gf); } // reserve with tg graph to get the number of splits and nodes { llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_tg = graph_build(ctx.get(), ubatch_tg, true); - auto & gf_tg = 
res_tg.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_tg, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_tg = ggml_graph_n_nodes(gf_tg); + n_nodes_tg = ggml_graph_n_nodes(gf); } // reserve again with pp graph to avoid ggml-alloc reallocations during inference { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_pp = graph_build(ctx.get(), ubatch_pp, true); - auto & gf_pp = res_pp.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } @@ -350,6 +347,10 @@ uint32_t llama_context::n_threads_batch() const { return cparams.n_threads_batch; } +int32_t llama_context::max_nodes() const { + return std::max(8192, 5*model.n_tensors()); +} + enum llama_pooling_type llama_context::pooling_type() const { return cparams.pooling_type; } @@ -555,7 +556,7 @@ void llama_context::synchronize() { t_compute_start_us = 0; } -ggml_context_ptr llama_context::graph_init() { +ggml_cgraph * llama_context::graph_init() { inp_tokens = nullptr; inp_embd = nullptr; inp_pos = nullptr; @@ -569,18 +570,21 @@ ggml_context_ptr llama_context::graph_init() { /*.no_alloc =*/ true, }; - return ggml_context_ptr { ggml_init(params) }; + ctx_compute.reset(ggml_init(params)); + + return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); } llama_graph_result llama_context::graph_build( ggml_context * ctx, + ggml_cgraph * gf, const llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(ctx, this, cparams, ubatch, worst_case); + return model.build_graph(ctx, gf, this, cparams, ubatch, worst_case); } enum ggml_status llama_context::graph_compute( - ggml_cgraph * graph, + ggml_cgraph * gf, bool batched) { int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; @@ -596,7 +600,7 @@ enum ggml_status llama_context::graph_compute( set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf); if (status != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); } @@ -881,7 +885,6 @@ void llama_context::output_reorder() { } } - void llama_context::build_cb( ggml_tensor * cur, const char * name, @@ -1010,6 +1013,55 @@ ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } +ggml_tensor * llama_context::build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) { + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & n_rot = model.hparams.n_rot; + const auto & rope_type = model.hparams.rope_type; + + struct ggml_tensor * tmp; + + if (ggml_is_quantized(cur->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); + + if (bbuf) { + for (auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + } + + tmp = ggml_rope_ext_inplace(ctx0, tmp, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + + tmp = ggml_cpy(ctx0, tmp, cur); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, cur, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + + return tmp; +} + ggml_tensor * llama_context::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, @@ -1579,7 +1631,8 @@ void llama_context::perf_reset() { llama_context_kv_self::llama_context_kv_self( const llama_model & model, const llama_context_params & params) : - llama_context(model, params) { + llama_context(model, params), + kv_self(model.hparams) { const auto & hparams = model.hparams; LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); @@ -1640,13 +1693,13 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -ggml_context_ptr llama_context_kv_self::graph_init() { +ggml_cgraph * llama_context_kv_self::graph_init() { inp_KQ_mask = nullptr; inp_KQ_mask_cnv = nullptr; inp_KQ_mask_swa = nullptr; inp_KQ_mask_swa_cnv = nullptr; inp_KQ_mask_cross = nullptr; - inp_K_shift = nullptr; + inp_k_shift = nullptr; inp_s_copy = nullptr; inp_s_mask = nullptr; inp_embd_enc = nullptr; @@ -1719,10 +1772,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, false); - - auto * gf = res.gf; + auto * gf 
= graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -1999,12 +2050,12 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, true); + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), res.gf)) { + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } @@ -2014,10 +2065,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, false); - - auto * gf = res.gf; + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -2195,10 +2244,10 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (inp_K_shift) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + if (inp_k_shift) { + assert(ggml_backend_buffer_is_host(inp_k_shift->buffer)); - int32_t * data = (int32_t *) inp_K_shift->data; + int32_t * data = (int32_t *) inp_k_shift->data; for (uint32_t i = 0; i < kv_self.size; ++i) { data[i] = kv_self.cells[i].delta; @@ -2482,11 +2531,9 @@ void llama_context_kv_self::kv_self_update() { if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched.get()); - auto ctx = graph_init(); + auto * gf = graph_init(); - ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); - - build_kv_self_shift(ctx.get(), gf); + kv_self.build_shift(ctx_compute.get(), gf, this); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2510,11 +2557,9 @@ void llama_context_kv_self::kv_self_update() { if (kv.do_defrag) { ggml_backend_sched_reset(sched.get()); - auto ctx = graph_init(); - - ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); + auto * gf = graph_init(); - build_kv_self_defrag(ctx.get(), gf); + kv_self.build_defrag(ctx_compute.get(), gf, max_nodes(), !cparams.flash_attn); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2529,6 +2574,13 @@ void llama_context_kv_self::kv_self_update() { } } +ggml_tensor * llama_context_kv_self::build_inp_k_shift(ggml_context * ctx0) { + inp_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + ggml_set_input(inp_k_shift); + + return inp_k_shift; +} + void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, @@ -2765,348 +2817,6 @@ ggml_tensor * llama_context_kv_self::build_attn_soft_max( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -void llama_context_kv_self::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) { - const auto & 
n_ctx = cparams.n_ctx; - const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & freq_base = cparams.rope_freq_base; - const auto & freq_scale = cparams.rope_freq_scale; - - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_attn_factor = cparams.yarn_attn_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; - - const auto & hparams = model.hparams; - - const auto & n_rot = hparams.n_rot; - const auto & n_layer = hparams.n_layer; - const auto & rope_type = hparams.rope_type; - - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - GGML_ASSERT(kv_self.size == n_ctx); - - inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - //cb(inp_K_shift, "K_shift", -1); - ggml_set_input(inp_K_shift); - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - struct ggml_tensor * rope_factors = build_rope_factors(il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - 0); - - struct ggml_tensor * tmp; - if (ggml_is_quantized(k->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); - //cb(tmp, "K_f32", il); - - for (auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - tmp = ggml_rope_ext_inplace(ctx0, tmp, - inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - //cb(tmp, "K_shifted_f32", il); - - tmp = ggml_cpy(ctx0, tmp, k); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, k, - inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - } - //cb(tmp, "K_shifted", il); - - ggml_build_forward_expand(graph, tmp); - } -} - -void llama_context_kv_self::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) { - const auto & hparams = model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_kv_self_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = model.max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (model.max_nodes() - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - 
- uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - kv_self.cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - kv_self.head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = kv_self.size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(kv_self.v_l[il], 
buf_v.data(), 0, buf_v.size()); - } -#else - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (cparams.flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); - } - - ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } - - i += nm - 1; - } - - //LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes); -#endif -} - ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, @@ -3162,7 +2872,7 @@ ggml_tensor * llama_context_kv_self::build_inp_s_mask( ggml_tensor * llama_context_kv_self::build_copy_mask_state( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -3185,7 +2895,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state( states = ggml_mul(ctx0, states, state_mask); // copy states which won't be changed further (between n_seqs and n_kv) - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)), ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); @@ -3197,7 +2907,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state( // TODO: split ggml_tensor * llama_context_kv_self::build_mamba_layer( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -3231,11 +2941,11 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( // (ab)using the KV cache to store the states struct ggml_tensor * conv = build_copy_mask_state( - ctx0, graph, conv_states_all, state_copy, state_mask, + ctx0, gf, conv_states_all, state_copy, state_mask, n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); struct ggml_tensor * ssm = build_copy_mask_state( - ctx0, 
graph, ssm_states_all, state_copy, state_mask, + ctx0, gf, ssm_states_all, state_copy, state_mask, n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); @@ -3257,7 +2967,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( // copy last (d_conv - 1) columns back into the state cache struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv, ggml_view_1d(ctx0, conv_states_all, (d_conv - 1)*(d_inner)*(n_seqs), @@ -3306,7 +3016,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); // store last states - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); @@ -3333,7 +3043,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, @@ -3349,7 +3059,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( struct ggml_tensor * token_shift_all = kv_self.k_l[il]; struct ggml_tensor * token_shift = build_copy_mask_state( - ctx0, graph, token_shift_all, state_copy, state_mask, + ctx0, gf, token_shift_all, state_copy, state_mask, n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); @@ -3384,7 +3094,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor * state_copy, @@ -3509,7 +3219,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( } struct ggml_tensor * wkv_state = build_copy_mask_state( - ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + ctx0, gf, kv_self.v_l[il], state_copy, state_mask, n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); struct ggml_tensor * wkv_output; @@ -3522,7 +3232,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); ggml_build_forward_expand( - graph, + gf, ggml_cpy( ctx0, wkv_state, @@ -3558,7 +3268,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); - kv_self.state_write(io, model.hparams); + kv_self.state_write(io); return io.n_bytes(); } @@ -3566,7 +3276,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); - kv_self.state_read(io, model.hparams); + kv_self.state_read(io); return io.n_bytes(); } @@ -3574,7 +3284,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context::state_seq_get_data(io, seq_id); - 
kv_self.state_write(io, model.hparams, seq_id); + kv_self.state_write(io, seq_id); return io.n_bytes(); } @@ -3582,7 +3292,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context::state_seq_set_data(io, seq_id); - kv_self.state_read(io, model.hparams, seq_id); + kv_self.state_read(io, seq_id); return io.n_bytes(); } diff --git a/src/llama-context.h b/src/llama-context.h index 4bf8244e625c1..0311ad4734daf 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -43,6 +43,8 @@ struct llama_context : public llama_graph_i { virtual uint32_t n_threads() const; virtual uint32_t n_threads_batch() const; + virtual int32_t max_nodes() const; + virtual llama_kv_cache * get_kv_self() = 0; virtual const llama_kv_cache * get_kv_self() const = 0; @@ -93,18 +95,19 @@ struct llama_context : public llama_graph_i { virtual void synchronize(); // zero-out inputs and create ggml_context - virtual ggml_context_ptr graph_init(); + virtual ggml_cgraph * graph_init(); // TODO: add encode/decode graphs virtual llama_graph_result graph_build( - ggml_context * ctx, - const llama_ubatch & ubatch, - bool worst_case); + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch, + bool worst_case); // returns the result of ggml_backend_sched_graph_compute_async execution virtual enum ggml_status graph_compute( - ggml_cgraph * graph, - bool batched); + ggml_cgraph * gf, + bool batched); virtual void input_set(const llama_ubatch & ubatch); @@ -172,6 +175,13 @@ struct llama_context : public llama_graph_i { virtual ggml_tensor * build_rope_factors(int il); + virtual ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf); + virtual ggml_tensor * build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, @@ -274,6 +284,8 @@ struct llama_context : public llama_graph_i { ggml_backend_sched_ptr sched; + ggml_context_ptr ctx_compute; + // memory buffers used to evaluate the model std::vector buf_compute_meta; @@ -332,7 +344,7 @@ class llama_context_kv_self : public llama_context { virtual void kv_self_update() override; - virtual ggml_context_ptr graph_init() override; + virtual ggml_cgraph * graph_init() override; virtual void input_set(const llama_ubatch & ubatch) override; @@ -349,11 +361,13 @@ class llama_context_kv_self : public llama_context { llama_kv_cache kv_self; - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] + ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * inp_k_shift; // I32 [kv_size] + + virtual ggml_tensor * build_inp_k_shift(ggml_context * ctx0) override; virtual void build_attn_inp( ggml_context * ctx0, @@ -387,15 +401,6 @@ class llama_context_kv_self : public llama_context { ggml_tensor * kq, float kq_scale) override; - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) override; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache 
- virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) override; - // === encoder-decoder === // whether we are computing encoder output or decoder output diff --git a/src/llama-graph.h b/src/llama-graph.h index 14d0c5da0a359..6098d2b9293b4 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -8,11 +8,10 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; +struct ggml_backend_buffer; struct llama_ubatch; struct llama_graph_result { - ggml_cgraph * gf = nullptr; - // important graph nodes ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; @@ -50,6 +49,14 @@ class llama_graph_i { virtual ggml_tensor * build_rope_factors(int il) = 0; + // note: optionally set the backend to be the same as the bbuf's backend + virtual ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuft) = 0; + // graph build API (context-specific) virtual ggml_tensor * build_inp_embd( @@ -83,7 +90,7 @@ class llama_graph_i { virtual void build_attn_kv_store( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, @@ -92,7 +99,7 @@ class llama_graph_i { virtual ggml_tensor * build_attn_qkv( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -106,14 +113,8 @@ class llama_graph_i { ggml_tensor * kq, float kq_scale) = 0; - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; + virtual ggml_tensor * build_inp_k_shift( + ggml_context * ctx0) = 0; virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0, @@ -135,7 +136,7 @@ class llama_graph_i { virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -146,7 +147,7 @@ class llama_graph_i { virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -156,7 +157,7 @@ class llama_graph_i { virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, @@ -172,7 +173,7 @@ class llama_graph_i { virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor * state_copy, @@ -181,3 +182,18 @@ class llama_graph_i { int il, bool worst_case) = 0; }; + +class llama_graph_kv_cache_i { +public: + virtual void build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) = 0; +}; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index c93410f0a412c..5dde8b8703875 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -13,6 +13,9 @@ static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; +llama_kv_cache::llama_kv_cache(const 
llama_hparams & hparams) : hparams(hparams) { +} + bool llama_kv_cache::init( const llama_model & model, const llama_cparams & cparams, @@ -20,8 +23,6 @@ bool llama_kv_cache::init( ggml_type type_v, uint32_t kv_size, bool offload) { - const struct llama_hparams & hparams = model.hparams; - const int32_t n_layer = hparams.n_layer; has_shift = false; @@ -698,7 +699,309 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id) const { +void llama_kv_cache::build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) { + const auto & n_layer = hparams.n_layer; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + //GGML_ASSERT(kv_self.size == n_ctx); + + ggml_tensor * inp_k_shift = lgf->build_inp_k_shift(ctx0); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, k_l[il], + n_embd_head_k, n_head_kv, size, + ggml_row_size(k_l[il]->type, n_embd_head_k), + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + 0); + + ggml_tensor * cur = lgf->build_rope_shift(ctx0, k, inp_k_shift, rope_factors, k_l[il]->buffer); + + ggml_build_forward_expand(gf, cur); + } +} + +void llama_kv_cache::build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) { + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = cell_max(); + const uint32_t n_used = used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_kv_self_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = max_nodes/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (max_nodes - 2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? 
+ bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + 
if (!v_trans) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, id)); + } + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +#endif +} + +void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -733,16 +1036,16 @@ void llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hp io.write(&cell_count, sizeof(cell_count)); state_write_meta(io, cell_ranges, seq_id); - state_write_data(io, cell_ranges, hparams); + state_write_data(io, cell_ranges); } -void llama_kv_cache::state_read(llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id) { +void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id) { uint32_t cell_count; io.read_to(&cell_count, sizeof(cell_count)); bool res = true; res = res && state_read_meta(io, cell_count, seq_id); - res = res && state_read_data(io, hparams, cell_count); + res = res && state_read_data(io, cell_count); if (!res) { if (seq_id == -1) { @@ -773,7 +1076,7 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { +void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { const uint32_t v_trans = this->v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; @@ -955,7 +1258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t cell_count, return true; } -bool llama_kv_cache::state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count) { +bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; io.read_to(&v_trans, sizeof(v_trans)); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 3ea9abfce59be..67e59bc094b71 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -2,12 +2,12 @@ #include "llama.h" #include "llama-io.h" +#include "llama-graph.h" #include "ggml-cpp.h" #include #include -#include struct llama_cparams; struct llama_hparams; @@ -49,31 +49,13 @@ struct llama_kv_cache_slot_info { // TODO: pimpl // TODO: add notion of max sequences // TODO: add llama_hparams & -struct llama_kv_cache { - bool has_shift = false; - bool do_defrag = false; - bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token - bool v_trans = true; // the value tensor is transposed - bool can_shift = false; - - // Note: The value of head isn't only used to optimize searching - // for a free KV slot. llama_decode_impl also uses it, so it - // cannot be freely changed after a slot has been allocated. 
- uint32_t head = 0; - uint32_t size = 0; - uint32_t used = 0; // used cells (i.e. at least one seq_id) - - // computed before each graph build - uint32_t n = 0; - - std::vector cells; - - std::vector k_l; // per layer - std::vector v_l; +struct llama_kv_cache : public llama_graph_kv_cache_i { + llama_kv_cache(const llama_hparams & hparams); + virtual ~llama_kv_cache() = default; // TODO: become constructor bool init( - const llama_model & model, + const llama_model & model, // TODO: do not reference the model const llama_cparams & cparams, ggml_type type_k, ggml_type type_v, @@ -115,8 +97,48 @@ struct llama_kv_cache { size_t size_k_bytes() const; size_t size_v_bytes() const; - void state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; - void state_read (llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + // graph build API + + virtual void build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) override; + + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) override; + + // state save/load + + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1); + + // members + + const llama_hparams & hparams; + + bool has_shift = false; + bool do_defrag = false; + bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token + bool v_trans = true; // the value tensor is transposed + bool can_shift = false; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_impl also uses it, so it + // cannot be freely changed after a slot has been allocated. + uint32_t head = 0; + uint32_t size = 0; + uint32_t used = 0; // used cells (i.e. 
at least one seq_id) + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + std::vector k_l; // per layer + std::vector v_l; private: ggml_type type_k = GGML_TYPE_F16; @@ -126,10 +148,10 @@ struct llama_kv_cache { std::vector bufs; void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; - void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count); + bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; // diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 350dfd89cee3d..09fd63f61ce6c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3579,8 +3579,8 @@ size_t llama_model::size() const { return pimpl->n_bytes; } -size_t llama_model::max_nodes() const { - return std::max(8192, tensors_by_name.size()*5); +size_t llama_model::n_tensors() const { + return tensors_by_name.size(); } size_t llama_model::n_devices() const { @@ -3900,6 +3900,38 @@ struct llm_build_context { return inpL; } + // TODO: tmp + struct ggml_tensor * build_inp_pos() { + ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); + cb(cur, "inp_pos", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_out_ids() { + ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); + cb(cur, "inp_out_ids", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_mean() { + ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); + cb(cur, "inp_mean", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_cls() { + ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); + cb(cur, "inp_cls", -1); + + return cur; + } + // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, @@ -3915,6 +3947,22 @@ struct llm_build_context { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } + // TODO: tmp + struct ggml_tensor * build_inp_embd_enc() { + ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); + cb(cur, "embd_enc", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_KQ_mask_cross() { + ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + cb(cur, "KQ_mask_cross", -1); + + return cur; + } + struct ggml_tensor * build_norm( struct ggml_tensor * cur, struct ggml_tensor * mw, @@ -4195,7 +4243,7 @@ struct llm_build_context { } struct ggml_tensor * build_attn( - struct ggml_cgraph * graph, + struct ggml_cgraph * gf, struct ggml_tensor * wo, struct ggml_tensor * wo_b, struct ggml_tensor * k_cur, @@ -4206,17 +4254,17 @@ struct llm_build_context { int il) { // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); - //build_kv_store(graph, k_cur, v_cur, il); - lgf->build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + //build_kv_store(gf, k_cur, v_cur, il); + lgf->build_attn_kv_store(ctx0, gf, k_cur, 
v_cur, n_tokens, il, worst_case); struct ggml_tensor * cur; - //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf->build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + //cur = build_kqv(gf, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lgf->build_attn_qkv(ctx0, gf, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -4251,34 +4299,6 @@ struct llm_build_context { return cur; } - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); - cb(cur, "inp_pos", -1); - - return cur; - } - - struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); - cb(cur, "inp_out_ids", -1); - - return cur; - } - - struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); - cb(cur, "inp_mean", -1); - - return cur; - } - - struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); - cb(cur, "inp_cls", -1); - - return cur; - } - void append_pooling(struct ggml_cgraph * gf) { struct ggml_tensor * inp = res.t_embd; @@ -4377,20 +4397,6 @@ struct llm_build_context { // return pos_bias; //} - struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); - cb(cur, "embd_enc", -1); - - return cur; - } - - struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); - cb(cur, "KQ_mask_cross", -1); - - return cur; - } - void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -10936,16 +10942,13 @@ struct llm_build_context { llama_graph_result llama_model::build_graph( ggml_context * ctx, + ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, - bool worst_case) const { + bool worst_case) const { struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case); - auto & gf = llm.res.gf; - - gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false); - switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_MINICPM: diff --git a/src/llama-model.h b/src/llama-model.h index 2a9fca7d40c6d..94e7622943937 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -353,7 +353,7 @@ struct llama_model { std::string desc() const; size_t size() const; - size_t max_nodes() const; + size_t n_tensors() const; size_t n_devices() const; // total number of parameters in the model @@ -371,6 +371,7 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( ggml_context * ctx, + ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, From 5f11a5502a37df607d35c703f52dd6f8f6454bdd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Feb 2025 14:36:27 +0200 Subject: [PATCH 55/84] kv-cache : remove llama_kv_cache_i --- src/llama-context.cpp | 307 ++++++++++++++++++++++++++++++++++++++++- src/llama-context.h | 21 ++- src/llama-graph.h | 24 ++-- src/llama-kv-cache.cpp | 302 ---------------------------------------- src/llama-kv-cache.h | 15 +- 5 files changed, 330 insertions(+), 339 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 454e141c85796..bec82b4464303 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2533,7 +2533,7 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - kv_self.build_shift(ctx_compute.get(), gf, this); 
+ build_kv_self_shift(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2559,7 +2559,7 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - kv_self.build_defrag(ctx_compute.get(), gf, max_nodes(), !cparams.flash_attn); + build_kv_self_defrag(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2817,6 +2817,309 @@ ggml_tensor * llama_context_kv_self::build_attn_soft_max( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } +void llama_context_kv_self::build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) { + const auto & hparams = model.hparams; + + const auto & n_layer = hparams.n_layer; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + //GGML_ASSERT(kv_self.size == n_ctx); + + ggml_tensor * inp_k_shift = build_inp_k_shift(ctx0); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = build_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, kv_self.size, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0); + + ggml_tensor * cur = build_rope_shift(ctx0, k, inp_k_shift, rope_factors, kv_self.k_l[il]->buffer); + + ggml_build_forward_expand(gf, cur); + } +} + +void llama_context_kv_self::build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) { + const auto & hparams = model.hparams; + + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = kv_self.cell_max(); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_kv_self_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (max_nodes() - 2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? 
+ bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { // NOLINT + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, 
n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + } + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +#endif +} + ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama-context.h b/src/llama-context.h index 0311ad4734daf..a256f3042257b 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -378,7 +378,7 @@ class llama_context_kv_self : public llama_context { virtual void build_attn_kv_store( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, @@ -387,7 +387,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_attn_qkv( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -401,6 +401,15 @@ class llama_context_kv_self : public llama_context { ggml_tensor * kq, float kq_scale) override; + virtual void build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) override; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) override; + // === encoder-decoder === // whether we are computing encoder output or decoder output @@ -443,7 +452,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -454,7 +463,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -464,7 +473,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, @@ -480,7 +489,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor * state_copy, diff --git a/src/llama-graph.h b/src/llama-graph.h index 6098d2b9293b4..bb51b9a912f81 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -113,6 +113,15 @@ class llama_graph_i { 
ggml_tensor * kq, float kq_scale) = 0; + virtual void build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) = 0; + virtual ggml_tensor * build_inp_k_shift( ggml_context * ctx0) = 0; @@ -182,18 +191,3 @@ class llama_graph_i { int il, bool worst_case) = 0; }; - -class llama_graph_kv_cache_i { -public: - virtual void build_shift( - ggml_context * ctx0, - ggml_cgraph * gf, - llama_graph_i * lgf) = 0; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_defrag( - ggml_context * ctx0, - ggml_cgraph * gf, - int32_t max_nodes, - bool v_trans) = 0; -}; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 5dde8b8703875..8a87f91290eed 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -699,308 +699,6 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::build_shift( - ggml_context * ctx0, - ggml_cgraph * gf, - llama_graph_i * lgf) { - const auto & n_layer = hparams.n_layer; - - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - //GGML_ASSERT(kv_self.size == n_ctx); - - ggml_tensor * inp_k_shift = lgf->build_inp_k_shift(ctx0); - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, k_l[il], - n_embd_head_k, n_head_kv, size, - ggml_row_size(k_l[il]->type, n_embd_head_k), - ggml_row_size(k_l[il]->type, n_embd_k_gqa), - 0); - - ggml_tensor * cur = lgf->build_rope_shift(ctx0, k, inp_k_shift, rope_factors, k_l[il]->buffer); - - ggml_build_forward_expand(gf, cur); - } -} - -void llama_kv_cache::build_defrag( - ggml_context * ctx0, - ggml_cgraph * gf, - int32_t max_nodes, - bool v_trans) { - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = cell_max(); - const uint32_t n_used = used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_kv_self_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = max_nodes/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (max_nodes - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { 
- break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(v_l[il]->type); - const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, k_l[il], - n_embd_k_gqa, nm, - 
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
-
-        ggml_tensor * view_k_dst = ggml_view_2d(ctx0, k_l[il],
-                n_embd_k_gqa, nm,
-                ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
-
-        ggml_tensor * view_v_src;
-        ggml_tensor * view_v_dst;
-
-        if (!v_trans) {
-            // NOTE: the V cache is not transposed when using flash attention
-            view_v_src = ggml_view_2d(ctx0, v_l[il],
-                    n_embd_v_gqa, nm,
-                    ggml_row_size(v_l[il]->type, n_embd_v_gqa),
-                    ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
-
-            view_v_dst = ggml_view_2d(ctx0, v_l[il],
-                    n_embd_v_gqa, nm,
-                    ggml_row_size(v_l[il]->type, n_embd_v_gqa),
-                    ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
-        } else {
-            view_v_src = ggml_view_2d(ctx0, v_l[il],
-                    nm, n_embd_v_gqa,
-                    ggml_row_size(v_l[il]->type, size),
-                    ggml_row_size(v_l[il]->type, i));
-
-            view_v_dst = ggml_view_2d(ctx0, v_l[il],
-                    nm, n_embd_v_gqa,
-                    ggml_row_size(v_l[il]->type, size),
-                    ggml_row_size(v_l[il]->type, id));
-        }
-
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
-        }
-
-        i += nm - 1;
-    }
-
-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-#endif
-}
-
 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 67e59bc094b71..049193fd0f176 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -49,7 +49,7 @@ struct llama_kv_cache_slot_info {
 // TODO: pimpl
 // TODO: add notion of max sequences
 // TODO: add llama_hparams &
-struct llama_kv_cache : public llama_graph_kv_cache_i {
+struct llama_kv_cache {
     llama_kv_cache(const llama_hparams & hparams);
     virtual ~llama_kv_cache() = default;
 
@@ -97,19 +97,6 @@ struct llama_kv_cache : public llama_graph_kv_cache_i {
     size_t size_k_bytes() const;
     size_t size_v_bytes() const;
 
-    // graph build API
-
-    virtual void build_shift(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            llama_graph_i * lgf) override;
-
-    virtual void build_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            int32_t max_nodes,
-            bool v_trans) override;
-
     // state save/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;

From e17e4b72d16710ee430b6858d58ce6ab3f4a31bb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 19 Feb 2025 14:56:01 +0200
Subject: [PATCH 56/84] context : add llama_context_recurrent

ggml-ci
---
 src/llama-context.cpp | 151 ++++++++++++++++++++++++------------------
 src/llama-context.h   |  32 ++++++---
 src/llama-graph.cpp   | 135 +++++++++++++++++++++++++++++++++++++
 src/llama-graph.h     |  16 ++---
 src/llama.cpp         |  15 ++++-
 5 files changed, 266 insertions(+), 83 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index bec82b4464303..b571c9343fa88 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -20,6 +20,8 @@ llama_context::llama_context(
     model     (model),
     t_start_us(model.t_start_us),
     t_load_us (model.t_load_us) {
+    LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
+
     const auto & hparams = model.hparams;
 
     cparams.n_seq_max = std::max(1u, params.n_seq_max);
@@ -1633,6 +1635,8 @@ llama_context_kv_self::llama_context_kv_self(
         const llama_context_params & params) :
     llama_context(model, params),
     kv_self(model.hparams) {
+    LLAMA_LOG_INFO("%s: constructing llama_context_kv_self\n", __func__);
+
     const auto & hparams = 
model.hparams; LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); @@ -1700,8 +1704,6 @@ ggml_cgraph * llama_context_kv_self::graph_init() { inp_KQ_mask_swa_cnv = nullptr; inp_KQ_mask_cross = nullptr; inp_k_shift = nullptr; - inp_s_copy = nullptr; - inp_s_mask = nullptr; inp_embd_enc = nullptr; inp_pos_bucket = nullptr; @@ -2381,53 +2383,6 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } - if (kv_self.recurrent) { - const int64_t n_kv = kv_self.n; - - if (inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); - float * data = (float *) inp_s_mask->data; - - // clear unused states - for (int i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = kv_self.cells[cell_id]; - - data[i] = (float) (kv_cell.src >= 0); - - // TODO: do not mutate the KV cache - // only clear once - if (kv_cell.src < 0) { - kv_cell.src = cell_id; - } - } - } - - if (inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); - int32_t * data = (int32_t *) inp_s_copy->data; - - // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = kv_self.cells[cell_id]; - - // prevent out-of-bound sources - if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { - kv_cell.src = cell_id; - } - - data[i] = kv_cell.src; - - // TODO: do not mutate the KV cache - // ensure copy only happens once - if (kv_cell.src != (int32_t) cell_id) { - kv_cell.src = cell_id; - } - } - } - } - if (inp_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; @@ -2614,7 +2569,7 @@ void llama_context_kv_self::build_attn_inp( void llama_context_kv_self::build_attn_kv_store( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, @@ -2635,7 +2590,7 @@ void llama_context_kv_self::build_attn_kv_store( //cb(k_cache_view, "k_cache_view", il); // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(graph, ggml_cpy(ctx0, k_cur, k_cache_view)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); @@ -2653,12 +2608,12 @@ void llama_context_kv_self::build_attn_kv_store( } //cb(v_cache_view, "v_cache_view", il); - ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); } ggml_tensor * llama_context_kv_self::build_attn_qkv( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -2791,7 +2746,7 @@ ggml_tensor * llama_context_kv_self::build_attn_qkv( } } - ggml_build_forward_expand(graph, cur); + ggml_build_forward_expand(gf, cur); if (wo) { cur = build_lora_mm(ctx0, wo, cur); @@ -3152,7 +3107,79 @@ ggml_tensor * llama_context_kv_self::build_inp_KQ_mask_cross( return inp_KQ_mask_cross; } -ggml_tensor * llama_context_kv_self::build_inp_s_copy( +// +// llama_context_recurrent +// + +llama_context_recurrent::llama_context_recurrent( + const llama_model & model, + const llama_context_params & params) : + llama_context_kv_self(model, params) { + LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); +} + +llama_context_recurrent::~llama_context_recurrent() = default; + +ggml_cgraph * llama_context_recurrent::graph_init() { + inp_s_copy = nullptr; + inp_s_mask = 
nullptr; + + return llama_context_kv_self::graph_init(); +} + +void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { + // call base functionality + llama_context_kv_self::input_set(ubatch); + + GGML_ASSERT(kv_self.recurrent); + + const int64_t n_kv = kv_self.n; + + if (inp_s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); + float * data = (float *) inp_s_mask->data; + + // clear unused states + for (int i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self.head; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; + + data[i] = (float) (kv_cell.src >= 0); + + // TODO: do not mutate the KV cache + // only clear once + if (kv_cell.src < 0) { + kv_cell.src = cell_id; + } + } + } + + if (inp_s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); + int32_t * data = (int32_t *) inp_s_copy->data; + + // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self.head; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; + + // prevent out-of-bound sources + if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { + kv_cell.src = cell_id; + } + + data[i] = kv_cell.src; + + // TODO: do not mutate the KV cache + // ensure copy only happens once + if (kv_cell.src != (int32_t) cell_id) { + kv_cell.src = cell_id; + } + } + } +} + +ggml_tensor * llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? kv_self.size : kv_self.n; @@ -3163,7 +3190,7 @@ ggml_tensor * llama_context_kv_self::build_inp_s_copy( return inp_s_copy; } -ggml_tensor * llama_context_kv_self::build_inp_s_mask( +ggml_tensor * llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0, bool worst_case) { const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; @@ -3173,7 +3200,7 @@ ggml_tensor * llama_context_kv_self::build_inp_s_mask( return inp_s_mask; } -ggml_tensor * llama_context_kv_self::build_copy_mask_state( +ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * s, @@ -3208,7 +3235,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state( } // TODO: split -ggml_tensor * llama_context_kv_self::build_mamba_layer( +ggml_tensor * llama_context_recurrent::build_mamba_layer( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, @@ -3344,7 +3371,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( } -ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( +ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * state_copy, @@ -3370,8 +3397,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( return token_shift; } - -ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( +ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, @@ -3394,8 +3420,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( ); } - -ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( +ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, diff --git a/src/llama-context.h b/src/llama-context.h index a256f3042257b..133eb8b36f739 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -433,15 +433,28 @@ class llama_context_kv_self : public llama_context { int32_t n_tokens, bool worst_case) override; - // === recurrent === +protected: + virtual size_t state_get_data(llama_io_write_i & io) override; + virtual size_t state_set_data(llama_io_read_i & io) override; - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; +}; - // TODO: add recurrent cache - // TODO: add mamba-specific llama_context +// a recurrent transformer (ie.e RWKV, Mamba) +// TODO: temporary reuse kv_self, but in the future, implement recurrent-specific context with specific cache +class llama_context_recurrent : public llama_context_kv_self { +public: + llama_context_recurrent( + const llama_model & model, + const llama_context_params & params); + + virtual ~llama_context_recurrent(); + + virtual ggml_cgraph * graph_init() override; + + virtual void input_set(const llama_ubatch & ubatch) override; - // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, bool worst_case) override; @@ -499,11 +512,10 @@ class llama_context_kv_self : public llama_context { bool worst_case) override; protected: - virtual size_t state_get_data(llama_io_write_i & io) override; - virtual size_t state_set_data(llama_io_read_i & io) override; + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + // TODO: add recurrent cache }; // For internal test use diff --git a/src/llama-graph.cpp 
b/src/llama-graph.cpp index 20f2ee0bd56aa..17605e74cc90b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1 +1,136 @@ #include "llama-graph.h" + +#include "llama-impl.h" + +ggml_tensor * llama_graph_i::build_inp_s_copy ( + ggml_context * ctx0, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(s); + GGML_UNUSED(state_copy); + GGML_UNUSED(state_mask); + GGML_UNUSED(n_tokens); + GGML_UNUSED(n_state); + GGML_UNUSED(n_seqs); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(cur); + GGML_UNUSED(state_copy); + GGML_UNUSED(state_mask); + GGML_UNUSED(ubatch); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(state_copy); + GGML_UNUSED(state_mask); + GGML_UNUSED(ubatch); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(token_shift); + GGML_UNUSED(ubatch); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} + +ggml_tensor * llama_graph_i::build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(cur); + GGML_UNUSED(x_prev); + GGML_UNUSED(state_copy); + GGML_UNUSED(state_mask); + GGML_UNUSED(ubatch); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + + return nullptr; // NOLINT +} diff --git a/src/llama-graph.h b/src/llama-graph.h index bb51b9a912f81..b9456e3d1ca74 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -55,7 +55,7 @@ class llama_graph_i { ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuft) = 0; + ggml_backend_buffer * bbuf) = 0; // graph build API (context-specific) @@ -137,11 +137,11 @@ class llama_graph_i { virtual ggml_tensor * build_inp_s_copy( 
ggml_context * ctx0, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_inp_s_mask( ggml_context * ctx0, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -152,7 +152,7 @@ class llama_graph_i { int32_t n_tokens, int32_t n_state, int32_t n_seqs, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -162,7 +162,7 @@ class llama_graph_i { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -171,14 +171,14 @@ class llama_graph_i { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, int il, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -189,5 +189,5 @@ class llama_graph_i { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il, - bool worst_case) = 0; + bool worst_case); }; diff --git a/src/llama.cpp b/src/llama.cpp index a677902f0ba7c..3db1644775fe7 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -326,8 +326,19 @@ struct llama_context * llama_init_from_model( llama_context * ctx = nullptr; try { - // TODO: add logic which llama_context implementation to construct - ctx = new llama_context_kv_self(*model, params); + // TODO: make static method of llama_context + switch (model->arch) { + case LLM_ARCH_RWKV6: + case LLM_ARCH_RWKV6QWEN2: + case LLM_ARCH_MAMBA: + GGML_ASSERT(llama_model_is_recurrent(model)); + ctx = new llama_context_recurrent(*model, params); + break; + default: + GGML_ASSERT(!llama_model_is_recurrent(model)); + ctx = new llama_context_kv_self(*model, params); + }; + ctx->init(); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); From 2eacb4c1bfe01839f579e8aac3068f8758c26874 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Feb 2025 18:43:49 +0200 Subject: [PATCH 57/84] graph : simplify attention api ggml-ci --- src/llama-context.cpp | 65 +++++++++++++++++++------------------------ src/llama-context.h | 14 +++------- src/llama-graph.h | 13 ++------- src/llama-model.cpp | 8 +----- 4 files changed, 36 insertions(+), 64 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b571c9343fa88..818702143e196 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2567,63 +2567,56 @@ void llama_context_kv_self::build_attn_inp( } } -void llama_context_kv_self::build_attn_kv_store( +ggml_tensor * llama_context_kv_self::build_attn( ggml_context * ctx0, ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * q_cur, int32_t n_tokens, - int64_t il, + float kq_scale, + int il, bool worst_case) { const auto & hparams = model.hparams; const auto & n_ctx = cparams.n_ctx; - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - GGML_ASSERT(kv_self.size == n_ctx); + // store to KV cache + { + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); - //cb(k_cache_view, "k_cache_view", il); + GGML_ASSERT(kv_self.size == n_ctx); - // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + //cb(k_cache_view, "k_cache_view", il); - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); - struct ggml_tensor * v_cache_view = nullptr; + assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); - if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); - } else { - // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv_self.v_l[il]), - (kv_head)*ggml_element_size(kv_self.v_l[il])); + struct ggml_tensor * v_cache_view = nullptr; - v_cur = ggml_transpose(ctx0, v_cur); - } - //cb(v_cache_view, "v_cache_view", il); + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self.v_l[il]), + (kv_head)*ggml_element_size(kv_self.v_l[il])); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); -} + v_cur = ggml_transpose(ctx0, v_cur); + } + //cb(v_cache_view, "v_cache_view", il); -ggml_tensor * llama_context_kv_self::build_attn_qkv( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - int32_t n_tokens, - float kq_scale, - int il, - bool worst_case) { - const auto & hparams = model.hparams; + ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); + } - const auto & n_ctx = cparams.n_ctx; const auto & n_embd_head_k = hparams.n_embd_head_k; const auto & n_embd_head_v = hparams.n_embd_head_v; @@ -2657,8 +2650,6 @@ ggml_tensor * llama_context_kv_self::build_attn_qkv( const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); diff --git a/src/llama-context.h b/src/llama-context.h index 133eb8b36f739..fb241adf1d151 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -376,20 +376,13 @@ class llama_context_kv_self : public llama_context { bool swa, bool worst_case) override; - virtual void build_attn_kv_store( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - int32_t n_tokens, - int64_t il, - bool worst_case) override; - - virtual ggml_tensor * build_attn_qkv( + virtual ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * k_cur, + ggml_tensor * v_cur, ggml_tensor 
* q_cur, int32_t n_tokens, float kq_scale, @@ -443,6 +436,7 @@ class llama_context_kv_self : public llama_context { // a recurrent transformer (ie.e RWKV, Mamba) // TODO: temporary reuse kv_self, but in the future, implement recurrent-specific context with specific cache +//class llama_context_recurrent : public llama_context { class llama_context_recurrent : public llama_context_kv_self { public: llama_context_recurrent( diff --git a/src/llama-graph.h b/src/llama-graph.h index b9456e3d1ca74..9adfc6f2313e2 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -88,20 +88,13 @@ class llama_graph_i { bool swa, bool worst_case) = 0; - virtual void build_attn_kv_store( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - int32_t n_tokens, - int64_t il, - bool worst_case) = 0; - - virtual ggml_tensor * build_attn_qkv( + virtual ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * k_cur, + ggml_tensor * v_cur, ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 09fd63f61ce6c..a22720c3ad184 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4258,13 +4258,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - //build_kv_store(gf, k_cur, v_cur, il); - lgf->build_attn_kv_store(ctx0, gf, k_cur, v_cur, n_tokens, il, worst_case); - - struct ggml_tensor * cur; - - //cur = build_kqv(gf, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf->build_attn_qkv(ctx0, gf, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, k_cur, v_cur, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; From f95b04a21cbb748ff5ed1a0489389166bc345672 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 19 Feb 2025 18:47:37 +0200 Subject: [PATCH 58/84] model : fix order kvq -> qkv ggml-ci --- src/llama-context.cpp | 12 +++--- src/llama-context.h | 2 +- src/llama-graph.h | 2 +- src/llama-model.cpp | 95 ++++++++++++++++++++++--------------------- 4 files changed, 56 insertions(+), 55 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 818702143e196..dbc9231acf1c8 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2572,9 +2572,9 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, @@ -2617,9 +2617,6 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); } - const auto & n_embd_head_k = hparams.n_embd_head_k; - const auto & n_embd_head_v = hparams.n_embd_head_v; - // TODO: improve bool is_sliding = false; @@ -2648,8 +2645,11 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); diff --git a/src/llama-context.h b/src/llama-context.h index fb241adf1d151..2b3d5f122bbbe 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -381,9 +381,9 @@ class llama_context_kv_self : public llama_context { ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, diff --git a/src/llama-graph.h b/src/llama-graph.h index 9adfc6f2313e2..b64e0f5f4fdb0 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -93,9 +93,9 @@ class llama_graph_i { ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, + ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, - ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a22720c3ad184..debbacbb6183b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4246,9 +4246,9 @@ struct llm_build_context { struct ggml_cgraph * gf, struct ggml_tensor * wo, struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, int32_t n_tokens, float kq_scale, int il) { @@ -4258,7 +4258,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, k_cur, v_cur, q_cur, n_tokens, kq_scale, il, worst_case); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -4460,7 +4460,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -4632,7 +4632,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -4768,7 +4768,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -4874,7 +4874,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -4996,7 +4996,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5118,7 +5118,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -5265,7 +5265,7 @@ struct llm_build_context { cur = 
build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5375,7 +5375,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5470,7 +5470,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5763,7 +5763,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5896,13 +5896,13 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } } @@ -6048,7 +6048,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6168,7 +6168,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6283,7 +6283,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6401,7 +6401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6514,7 +6514,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6673,7 +6673,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -6796,7 +6796,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -6921,7 +6921,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } struct 
ggml_tensor * sa_out = cur; @@ -7024,7 +7024,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7136,7 +7136,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7257,7 +7257,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7376,7 +7376,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7570,7 +7570,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, il); + q_states, k_states, v_states, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -7692,7 +7692,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } if (il == n_layer - 1) { @@ -7806,7 +7806,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f, il); + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } cur = build_norm(cur, @@ -7943,7 +7943,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8143,7 +8143,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8276,8 +8276,9 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - n_tokens, 1.0f / sqrtf(float(n_embd_head)), il); + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8400,7 +8401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8515,7 +8516,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } cur = build_norm(cur, @@ -8643,7 +8644,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8773,7 +8774,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, 
Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8883,7 +8884,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9025,7 +9026,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9172,7 +9173,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, kq_scale, il); + Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -9400,7 +9401,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, n_tokens, kq_scale, il); + q_states, k_states, v_states, n_tokens, kq_scale, il); } if (il == n_layer - 1) { @@ -9558,7 +9559,7 @@ struct llm_build_context { cur = build_attn(gf, NULL, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, @@ -10007,7 +10008,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/float(n_embd_head), il); } if (il == n_layer - 1) { @@ -10135,7 +10136,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10254,7 +10255,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -10377,7 +10378,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -10699,7 +10700,7 @@ struct llm_build_context { cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); if (hparams.swin_norm) { cur = build_norm(cur, From b1554be1d7213fbc628e184bffef5e42a734595d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Feb 2025 15:18:45 +0200 Subject: [PATCH 59/84] context : add cache-less llama_context ggml-ci --- common/common.cpp | 2 +- src/llama-context.cpp | 1210 ++++++++++++++++++++++++++++++---------- src/llama-context.h | 108 ++-- src/llama-graph.cpp | 78 +++ src/llama-graph.h | 21 +- src/llama-kv-cache.cpp | 44 ++ src/llama-model.cpp | 58 +- src/llama.cpp | 5 + 8 files changed, 1122 insertions(+), 404 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b751959569ca1..ec95f32d63122 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -952,7 +952,7 @@ struct common_init_result common_init_from_params(common_params & params) { } if (params.ctx_shift && 
!llama_kv_self_can_shift(lctx)) { - LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); + LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__); params.ctx_shift = false; } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index dbc9231acf1c8..6b2a11ad69097 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -341,6 +341,10 @@ uint32_t llama_context::n_ubatch() const { return cparams.n_ubatch; } +uint32_t llama_context::n_seq_max() const { + return cparams.n_seq_max; +} + uint32_t llama_context::n_threads() const { return cparams.n_threads; } @@ -353,6 +357,20 @@ int32_t llama_context::max_nodes() const { return std::max(8192, 5*model.n_tensors()); } +llama_kv_cache * llama_context::get_kv_self() { + LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); + return nullptr; +} + +const llama_kv_cache * llama_context::get_kv_self() const { + LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); + return nullptr; +} + +void llama_context::kv_self_update() { + LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); +} + enum llama_pooling_type llama_context::pooling_type() const { return cparams.pooling_type; } @@ -566,6 +584,9 @@ ggml_cgraph * llama_context::graph_init() { inp_mean = nullptr; inp_cls = nullptr; + inp_kq_mask = nullptr; + inp_kq_mask_cnv = nullptr; + struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta.size(), /*.mem_buffer =*/ buf_compute_meta.data(), @@ -612,179 +633,11 @@ enum ggml_status llama_context::graph_compute( return status; } -void llama_context::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); - } - - if (ubatch.embd) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); - } - - if (ubatch.pos && inp_pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - - if (!inp_out_ids) { - LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); - int32_t * data = (int32_t *) inp_out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_mean); - 
GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - - float * data = (float *) inp_mean->data; - memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } - - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); -} - int32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; - const int64_t n_outputs_max = std::max(n_outputs, cparams.n_seq_max); + const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); @@ -887,72 +740,401 @@ void llama_context::output_reorder() { } } -void llama_context::build_cb( - 
ggml_tensor * cur, - const char * name, - const llama_ubatch & ubatch, - int il) { - if (il >= 0) { - ggml_format_name(cur, "%s-%d", name, il); - } else { - ggml_set_name(cur, name); +int llama_context::encode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; } - if (!cparams.offload_kqv) { - if (strcmp(name, "kqv_merged_cont") == 0) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); - } - } + // temporary allocate memory for the input batch if needed + llama_batch_allocr batch_allocr(inp_batch, 0); - // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends - // FIXME: fix in ggml_backend_sched - const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; - if (ubatch.n_tokens < 32 || full_offload) { - if (il != -1 && strcmp(name, "norm") == 0) { - const auto & dev_layer = model.dev_layer(il); - for (auto & backend : backends) { - if (ggml_backend_get_device(backend.get()) == dev_layer) { - if (ggml_backend_supports_op(backend.get(), cur)) { - ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get()); - } - } + const llama_batch & batch = batch_allocr.batch; + + const int32_t n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; } } } -} -llama_perf_context_data llama_context::perf_get_data() const { - llama_perf_context_data data = {}; + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); - data.t_start_ms = 1e-3 * t_start_us; - data.t_load_ms = 1e-3 * t_load_us; - data.t_p_eval_ms = 1e-3 * t_p_eval_us; - data.t_eval_ms = 1e-3 * t_eval_us; - data.n_p_eval = std::max(1, n_p_eval); - data.n_eval = std::max(1, n_eval); + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } - return data; -} + n_queued_tokens += n_tokens; -ggml_tensor * llama_context::build_cvec( - ggml_context * ctx0, - ggml_tensor * cur, - int il) { - return cvec.apply_to(ctx0, cur, il); -} + const int64_t n_embd = hparams.n_embd; -ggml_tensor * llama_context::build_lora_mm( - ggml_context * ctx0, - ggml_tensor * w, - ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); - if (lw == nullptr) { - continue; - } + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (output_reserve(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (int32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + n_outputs = n_tokens; + + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto * gf = 
graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + GGML_ASSERT(embd != nullptr); + + // extract token embeddings + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
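+    // note: the asynchronous tensor reads above are not synchronized here; they are expected to
+    //       complete when the caller obtains the output (same convention as in decode() below)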
+ ggml_backend_sched_reset(sched.get()); + + return 0; +} + +int llama_context::decode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + llama_batch_allocr batch_allocr(inp_batch, 0); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + + const int64_t n_tokens = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int64_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); + } + } + } + + // micro-batching is not possible without KV cache + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + n_queued_tokens += n_tokens; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + embd_seq.clear(); + + int64_t n_outputs_all = 0; + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + } else if (logits_all || embd_pooled) { + n_outputs_all = n_tokens; + } else { + // keep last output only + n_outputs_all = 1; + } + + const bool logits_all = n_outputs_all == n_tokens; + + sbatch.from_batch(batch, n_embd, + /* simple_split */ true, + /* logits_all */ logits_all); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + n_outputs = n_outputs_all; + + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; + auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; + + if (t_embd && res.t_embd_pooled) { + t_embd = res.t_embd_pooled; + } + + // extract logits + if (t_logits && n_outputs > 0) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits; + + if (n_outputs) { + GGML_ASSERT(n_outputs <= n_outputs_all); + GGML_ASSERT(n_outputs*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd && n_outputs > 0) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + if (n_outputs) { + GGML_ASSERT(n_outputs <= n_outputs_all); + GGML_ASSERT(n_outputs*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings (cleared before processing each batch) + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (int64_t i = 0; i < n_outputs_all; ++i) { + int64_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // wait for the computation to finish (automatically done when obtaining the model output) + //synchronize(); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
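+    // a minimal sketch of how the mapping above is typically consumed (assuming the usual
+    // public accessor is kept unchanged by this refactor):
+    //
+    //   // row of the logits buffer corresponding to the i-th token of the submitted batch
+    //   const float * logits_i = llama_get_logits_ith(ctx, i);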
+ ggml_backend_sched_reset(sched.get()); + + return 0; +} + +void llama_context::build_cb( + ggml_tensor * cur, + const char * name, + const llama_ubatch & ubatch, + int il) { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } + + if (!cparams.offload_kqv) { + if (strcmp(name, "kqv_merged_cont") == 0) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends + // FIXME: fix in ggml_backend_sched + const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; + if (ubatch.n_tokens < 32 || full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + const auto & dev_layer = model.dev_layer(il); + for (auto & backend : backends) { + if (ggml_backend_get_device(backend.get()) == dev_layer) { + if (ggml_backend_supports_op(backend.get(), cur)) { + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get()); + } + } + } + } + } +} + +ggml_tensor * llama_context::build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il) { + return cvec.apply_to(ctx0, cur, il); +} + +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } const float adapter_scale = lora.second; const float scale = lw->get_scale(lora.first->alpha, adapter_scale); @@ -1002,7 +1184,7 @@ ggml_tensor * llama_context::build_rope_factors(int il) { const auto & hparams = model.hparams; // choose long/short freq factors based on the context size - const auto n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + const auto n_ctx_per_seq = n_ctx() / n_seq_max(); if (model.layers[il].rope_freqs != nullptr) { return model.layers[il].rope_freqs; @@ -1141,16 +1323,176 @@ ggml_tensor * llama_context::build_inp_mean( inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); ggml_set_input(inp_mean); - return inp_mean; -} + return inp_mean; +} + +ggml_tensor * llama_context::build_inp_cls( + ggml_context * ctx0, + int32_t n_tokens) { + inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp_cls); + + return inp_cls; +} + +ggml_tensor * llama_context::build_attn( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + + //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + const auto & kq_mask = inp_kq_mask_cnv; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + + //const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; + + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch + GGML_UNUSED(worst_case); + const auto n_kv = n_tokens; + + struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, k_cur, 
0, 2, 1, 3)); + //cb(k, "k", il); + + struct ggml_tensor * cur; + + //if (cparams.flash_attn) { + if (false) { // TODO: need to pad the batch size to a multiple of GGML_KQ_MASK_PAD + GGML_UNUSED(model); + GGML_UNUSED(n_ctx); + + struct ggml_tensor * v = ggml_cont(ctx0, ggml_permute(ctx0, v_cur, 0, 2, 1, 3)); + v = ggml_reshape_3d(ctx0, v, n_embd_head_v, n_kv, n_head_kv); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); + + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); + } else { + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx0, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx0, kq); + kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); + + // split cached v into n_head heads + struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens))); + + v = ggml_reshape_3d(ctx0, v, n_kv, n_embd_head_v, n_head_kv); + //cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + //cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); + //cb(cur, "kqv_merged_cont", il); + + if (!cparams.offload_kqv) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + ggml_build_forward_expand(gf, cur); + + if (wo) { + cur = build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) { + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch + GGML_UNUSED(causal); + GGML_UNUSED(swa); + GGML_UNUSED(worst_case); + + inp_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_kq_mask, "KQ_mask", -1); + ggml_set_input(inp_kq_mask); + + inp_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_kq_mask, GGML_TYPE_F16) : inp_kq_mask; +} + +// +// perf +// + +llama_perf_context_data llama_context::perf_get_data() const { + llama_perf_context_data data = {}; + + data.t_start_ms = 1e-3 * t_start_us; + data.t_load_ms = 1e-3 * t_load_us; + data.t_p_eval_ms = 1e-3 * t_p_eval_us; + data.t_eval_ms = 1e-3 * t_eval_us; + data.n_p_eval = std::max(1, n_p_eval); + data.n_eval = std::max(1, n_eval); -ggml_tensor * llama_context::build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) { - inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp_cls); + return data; +} - return inp_cls; +void llama_context::perf_reset() { + t_start_us = ggml_time_us(); + t_eval_us = n_eval = 0; + t_p_eval_us = n_p_eval = 0; } // @@ -1620,10 +1962,277 @@ size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_ return io.n_bytes(); } -void llama_context::perf_reset() { - t_start_us = ggml_time_us(); - t_eval_us = n_eval = 0; - t_p_eval_us = n_p_eval = 0; +// +// input +// + +void llama_context::input_set(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + } + + if (ubatch.embd) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); + } + + if (ubatch.pos && inp_pos) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); + } + + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); + + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch.output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch.output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); + + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + + sum[seq_id] += ubatch.n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + 
} + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } + } + + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } + + if (inp_kq_mask) { + // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. + if (cparams.causal_attn) { + // TODO: need to use the batch directly to construct the masks + GGML_ABORT("TODO"); + + //const int64_t n_kv = ubatch.n_tokens; + //const int64_t n_tokens = ubatch.n_tokens; + //const int64_t n_seq_tokens = ubatch.n_seq_tokens; + //const int64_t n_seqs = ubatch.n_seqs; + + //float * data = nullptr; + + //if (inp_kq_mask) { + // GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + // data = (float *) inp_kq_mask->data; + //} + + //// For causal attention, use only the previous KV cells + //// of the correct sequence for each token of the ubatch. + //// It's assumed that if a token in the batch has multiple sequences, they are equivalent. 
+ //for (int h = 0; h < 1; ++h) { + // for (int s = 0; s < n_seqs; ++s) { + // const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // for (int j = 0; j < n_seq_tokens; ++j) { + // const llama_pos pos = ubatch.pos[s*n_seq_tokens + j]; + + // for (int i = 0; i < n_kv; ++i) { + // float f; + // if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + // f = -INFINITY; + // } else { + // if (hparams.use_alibi) { + // f = -std::abs(kv_self.cells[i].pos - pos); + // } else { + // f = 0.0f; + // } + // } + + // if (data) { + // data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + // } + // } + // } + // } + + // if (data) { + // for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + // for (int j = 0; j < n_kv; ++j) { + // data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + // } + // } + // } + //} + } else { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_stride = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + + float * data = (float *) inp_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; + } + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } + } + } + } + } + } + + GGML_ASSERT( + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); } // @@ -1684,11 +2293,6 @@ llama_context_kv_self::llama_context_kv_self( llama_context_kv_self::~llama_context_kv_self() = default; -uint32_t llama_context_kv_self::n_seq_max() const { - // TODO: add notion of n_seq_max to llama_kv_cache and use it here - return kv_self.size; -} - llama_kv_cache * llama_context_kv_self::get_kv_self() { return &kv_self; } @@ -1698,14 +2302,15 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { } ggml_cgraph * llama_context_kv_self::graph_init() { - inp_KQ_mask = nullptr; - inp_KQ_mask_cnv = nullptr; - inp_KQ_mask_swa = nullptr; - inp_KQ_mask_swa_cnv = nullptr; - inp_KQ_mask_cross = nullptr; - inp_k_shift = nullptr; - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_kq_mask_cross = nullptr; + + inp_self_kq_mask = nullptr; + inp_self_kq_mask_cnv = nullptr; + inp_self_kq_mask_swa = nullptr; + inp_self_kq_mask_swa_cnv = nullptr; + inp_self_k_shift = nullptr; return llama_context::graph_init(); } @@ -1979,8 +2584,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { const auto & n_ubatch = cparams.n_ubatch; - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - if (kv_self.recurrent) { if (embd_pooled) { // Pooled embeddings cannot be split across ubatches (yet) @@ -2033,7 +2636,7 @@ int 
llama_context_kv_self::decode(llama_batch & inp_batch) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); + const uint32_t pad = get_ctx_padding(cparams); kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); //kv_self.n = llama_kv_cache_cell_max(kv_self); } @@ -2246,10 +2849,10 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (inp_k_shift) { - assert(ggml_backend_buffer_is_host(inp_k_shift->buffer)); + if (inp_self_k_shift) { + assert(ggml_backend_buffer_is_host(inp_self_k_shift->buffer)); - int32_t * data = (int32_t *) inp_k_shift->data; + int32_t * data = (int32_t *) inp_self_k_shift->data; for (uint32_t i = 0; i < kv_self.size; ++i) { data[i] = kv_self.cells[i].delta; @@ -2262,7 +2865,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context::input_set(ubatch); - if (inp_KQ_mask || inp_KQ_mask_swa) { + if (inp_self_kq_mask || inp_self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; @@ -2273,14 +2876,14 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (inp_KQ_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); - data = (float *) inp_KQ_mask->data; + if (inp_self_kq_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask->buffer)); + data = (float *) inp_self_kq_mask->data; } - if (inp_KQ_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_swa->buffer)); - data_swa = (float *) inp_KQ_mask_swa->data; + if (inp_self_kq_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask_swa->buffer)); + data_swa = (float *) inp_self_kq_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -2341,11 +2944,11 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = hparams.causal_attn && !is_encoding ? 
kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask->buffer)); - float * data = (float *) inp_KQ_mask->data; + float * data = (float *) inp_self_kq_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -2442,14 +3045,14 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); } - if (!is_encoding && inp_KQ_mask_cross) { + if (!is_encoding && inp_kq_mask_cross) { const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_cross->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask_cross->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - float * data = (float *) inp_KQ_mask_cross->data; + float * data = (float *) inp_kq_mask_cross->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -2529,11 +3132,11 @@ void llama_context_kv_self::kv_self_update() { } } -ggml_tensor * llama_context_kv_self::build_inp_k_shift(ggml_context * ctx0) { - inp_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); - ggml_set_input(inp_k_shift); +ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { + inp_self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + ggml_set_input(inp_self_k_shift); - return inp_k_shift; + return inp_self_k_shift; } void llama_context_kv_self::build_attn_inp( @@ -2542,28 +3145,28 @@ void llama_context_kv_self::build_attn_inp( bool causal, bool swa, bool worst_case) { - const auto & hparams = model.hparams; - const auto n_kv = worst_case ? kv_self.size : kv_self.n; - inp_KQ_mask = causal + inp_self_kq_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_KQ_mask, "KQ_mask", -1); - ggml_set_input(inp_KQ_mask); + //cb(inp_self_kq_mask, "KQ_mask", -1); + ggml_set_input(inp_self_kq_mask); - inp_KQ_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask, GGML_TYPE_F16) : inp_KQ_mask; + inp_self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_self_kq_mask, GGML_TYPE_F16) : inp_self_kq_mask; if (swa) { + const auto & hparams = model.hparams; + GGML_ASSERT(hparams.n_swa > 0); - inp_KQ_mask_swa = causal + inp_self_kq_mask_swa = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_KQ_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(inp_KQ_mask_swa); + //cb(inp_self_kq_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp_self_kq_mask_swa); - inp_KQ_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask_swa, GGML_TYPE_F16) : inp_KQ_mask_swa; + inp_self_kq_mask_swa_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_self_kq_mask_swa, GGML_TYPE_F16) : inp_self_kq_mask_swa; } } @@ -2598,7 +3201,7 @@ ggml_tensor * llama_context_kv_self::build_attn( // note: storing RoPE-ed version of K in the KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens); struct ggml_tensor * v_cache_view = nullptr; @@ -2641,7 +3244,7 @@ ggml_tensor * llama_context_kv_self::build_attn( } }; - const auto & kq_mask = is_sliding ? inp_KQ_mask_swa_cnv : inp_KQ_mask_cnv; + const auto & kq_mask = is_sliding ? inp_self_kq_mask_swa_cnv : inp_self_kq_mask_cnv; const auto n_kv = worst_case ? kv_self.size : kv_self.n; @@ -2754,15 +3357,6 @@ ggml_tensor * llama_context_kv_self::build_attn( return cur; } -ggml_tensor * llama_context_kv_self::build_attn_soft_max( - ggml_context * ctx0, - ggml_tensor * kq, - float kq_scale) { - const auto & hparams = model.hparams; - - return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); -} - void llama_context_kv_self::build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) { @@ -2775,7 +3369,7 @@ void llama_context_kv_self::build_kv_self_shift( //GGML_ASSERT(kv_self.size == n_ctx); - ggml_tensor * inp_k_shift = build_inp_k_shift(ctx0); + ggml_tensor * inp_self_k_shift = build_inp_self_k_shift(ctx0); for (uint32_t il = 0; il < n_layer; ++il) { const int64_t n_head_kv = hparams.n_head_kv(il); @@ -2790,7 +3384,7 @@ void llama_context_kv_self::build_kv_self_shift( ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp_k_shift, rope_factors, kv_self.k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self.k_l[il]->buffer); ggml_build_forward_expand(gf, cur); } @@ -3082,7 +3676,7 @@ ggml_tensor * llama_context_kv_self::build_inp_embd_enc( return inp_embd_enc; } -ggml_tensor * llama_context_kv_self::build_inp_KQ_mask_cross( +ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( ggml_context * ctx0, int32_t n_tokens, bool worst_case) { @@ -3092,10 +3686,10 @@ ggml_tensor * llama_context_kv_self::build_inp_KQ_mask_cross( // TODO: not sure if this is correct const int32_t n_outputs_enc = worst_case ? 
n_tokens : embd_enc.size() / n_embd; - inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - ggml_set_input(inp_KQ_mask_cross); + inp_kq_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + ggml_set_input(inp_kq_mask_cross); - return inp_KQ_mask_cross; + return inp_kq_mask_cross; } // @@ -3765,11 +4359,23 @@ int32_t llama_apply_adapter_cvec( // struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { - return llama_kv_cache_view_init(*ctx->get_kv_self(), n_seq_max); + const auto * kv = ctx->get_kv_self(); + if (kv == nullptr) { + LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__); + return {}; + } + + return llama_kv_cache_view_init(*kv, n_seq_max); } void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { - llama_kv_cache_view_update(view, *ctx->get_kv_self()); + const auto * kv = ctx->get_kv_self(); + if (kv == nullptr) { + LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__); + return; + } + + llama_kv_cache_view_update(view, *kv); } // @@ -3903,7 +4509,7 @@ void llama_kv_cache_defrag(llama_context * ctx) { } void llama_kv_self_defrag(llama_context * ctx) { - return llama_kv_cache_defrag(ctx->get_kv_self()); + llama_kv_cache_defrag(ctx->get_kv_self()); } // deprecated diff --git a/src/llama-context.h b/src/llama-context.h index 2b3d5f122bbbe..c605cec6f6a19 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -20,6 +20,7 @@ class llama_io_write_i; using llama_loras = std::unordered_map; +// basic transformer without KV cache struct llama_context : public llama_graph_i { llama_context( const llama_model & model, @@ -38,17 +39,19 @@ struct llama_context : public llama_graph_i { virtual uint32_t n_ctx_per_seq() const; virtual uint32_t n_batch() const; virtual uint32_t n_ubatch() const; - virtual uint32_t n_seq_max() const = 0; + virtual uint32_t n_seq_max() const; virtual uint32_t n_threads() const; virtual uint32_t n_threads_batch() const; virtual int32_t max_nodes() const; - virtual llama_kv_cache * get_kv_self() = 0; - virtual const llama_kv_cache * get_kv_self() const = 0; + // returns nullptr + virtual llama_kv_cache * get_kv_self(); + virtual const llama_kv_cache * get_kv_self() const; - virtual void kv_self_update() = 0; + // noop + virtual void kv_self_update(); virtual enum llama_pooling_type pooling_type() const; @@ -109,8 +112,6 @@ struct llama_context : public llama_graph_i { ggml_cgraph * gf, bool batched); - virtual void input_set(const llama_ubatch & ubatch); - // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. 
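    // note: space is reserved for at least max(n_outputs, n_seq_max()) rows; callers such as
    //       encode()/decode() treat a return value smaller than the requested count as an
    //       allocation failure (they return -2 in that case)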
virtual int32_t output_reserve(int32_t n_outputs); @@ -128,7 +129,7 @@ struct llama_context : public llama_graph_i { // return positive int on warning // return negative int on error // - virtual int encode(llama_batch & inp_batch) = 0; + virtual int encode(llama_batch & inp_batch); // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), @@ -142,7 +143,7 @@ struct llama_context : public llama_graph_i { // return positive int on warning // return negative int on error // - virtual int decode(llama_batch & inp_batch) = 0; + virtual int decode(llama_batch & inp_batch); // // graph build API (generic) @@ -204,6 +205,31 @@ struct llama_context : public llama_graph_i { ggml_context * ctx0, int32_t n_tokens); + virtual void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case); + + virtual ggml_tensor * build_attn( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case); + + // perf + + virtual llama_perf_context_data perf_get_data() const; + virtual void perf_reset(); + // state save/load virtual size_t state_get_size(); @@ -238,13 +264,7 @@ struct llama_context : public llama_graph_i { const llama_token * tokens, size_t n_token_count); - // perf - - virtual llama_perf_context_data perf_get_data() const; - virtual void perf_reset(); - protected: - // state save/load virtual size_t state_get_data(llama_io_write_i & io); @@ -253,14 +273,21 @@ struct llama_context : public llama_graph_i { virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); - // input tensors + // input + + virtual void input_set(const llama_ubatch & ubatch); - struct ggml_tensor * inp_tokens; // I32 [n_batch] - struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - struct ggml_tensor * inp_pos; // I32 [n_batch] - struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - struct ggml_tensor * inp_cls; // I32 [n_batch] + // base input tensors + ggml_tensor * inp_tokens; // I32 [n_batch] + ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + ggml_tensor * inp_pos; // I32 [n_batch] + ggml_tensor * inp_out_ids; // I32 [n_outputs] + ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + ggml_tensor * inp_cls; // I32 [n_batch] + + // KQ mask input tensors + ggml_tensor * inp_kq_mask; // F32 [n_tokens, n_batch] + ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch] // members @@ -337,8 +364,6 @@ class llama_context_kv_self : public llama_context { virtual ~llama_context_kv_self(); - virtual uint32_t n_seq_max() const override; - virtual llama_kv_cache * get_kv_self() override; virtual const llama_kv_cache * get_kv_self() const override; @@ -346,8 +371,6 @@ class llama_context_kv_self : public llama_context { virtual ggml_cgraph * graph_init() override; - virtual void input_set(const llama_ubatch & ubatch) override; - virtual int encode(llama_batch & inp_batch) override; virtual int decode(llama_batch & inp_batch) override; @@ -357,17 +380,7 @@ class llama_context_kv_self : public llama_context { // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; - // === KV cache === - - llama_kv_cache kv_self; - - ggml_tensor * inp_KQ_mask; // F32 
[kv_size, n_batch] - ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] - ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] - ggml_tensor * inp_k_shift; // I32 [kv_size] - - virtual ggml_tensor * build_inp_k_shift(ggml_context * ctx0) override; + virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; virtual void build_attn_inp( ggml_context * ctx0, @@ -389,11 +402,6 @@ class llama_context_kv_self : public llama_context { int il, bool worst_case) override; - virtual ggml_tensor * build_attn_soft_max( - ggml_context * ctx0, - ggml_tensor * kq, - float kq_scale) override; - virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) override; @@ -414,14 +422,14 @@ class llama_context_kv_self : public llama_context { struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, bool worst_case) override; - virtual ggml_tensor * build_inp_KQ_mask_cross( + virtual ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, int32_t n_tokens, bool worst_case) override; @@ -432,6 +440,16 @@ class llama_context_kv_self : public llama_context { virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + + virtual void input_set(const llama_ubatch & ubatch) override; + + llama_kv_cache kv_self; + + ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * inp_self_k_shift; // I32 [kv_size] }; // a recurrent transformer (ie.e RWKV, Mamba) @@ -447,8 +465,6 @@ class llama_context_recurrent : public llama_context_kv_self { virtual ggml_cgraph * graph_init() override; - virtual void input_set(const llama_ubatch & ubatch) override; - virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, bool worst_case) override; @@ -506,6 +522,8 @@ class llama_context_recurrent : public llama_context_kv_self { bool worst_case) override; protected: + virtual void input_set(const llama_ubatch & ubatch) override; + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 17605e74cc90b..d9d4e00e98ba0 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2,6 +2,84 @@ #include "llama-impl.h" +ggml_tensor * llama_graph_i::build_attn( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(wo); + GGML_UNUSED(wo_b); + GGML_UNUSED(q_cur); + GGML_UNUSED(k_cur); + GGML_UNUSED(v_cur); + GGML_UNUSED(n_tokens); + GGML_UNUSED(kq_scale); + GGML_UNUSED(il); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +void llama_graph_i::build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + + 
LLAMA_LOG_ERROR("%s: not implemented\n", __func__); +} + +void llama_graph_i::build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); +} + +ggml_tensor * llama_graph_i::build_inp_self_k_shift( + ggml_context * ctx0) { + GGML_UNUSED(ctx0); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_i::build_inp_embd_enc( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(n_tokens); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_i::build_inp_kq_mask_cross( + ggml_context * ctx0, + int32_t n_tokens, + bool worst_case) { + GGML_UNUSED(ctx0); + GGML_UNUSED(n_tokens); + GGML_UNUSED(worst_case); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + ggml_tensor * llama_graph_i::build_inp_s_copy ( ggml_context * ctx0, bool worst_case) { diff --git a/src/llama-graph.h b/src/llama-graph.h index b64e0f5f4fdb0..8d237431e657a 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -99,34 +99,29 @@ class llama_graph_i { int32_t n_tokens, float kq_scale, int il, - bool worst_case) = 0; - - virtual ggml_tensor * build_attn_soft_max( - ggml_context * ctx0, - ggml_tensor * kq, - float kq_scale) = 0; + bool worst_case); virtual void build_kv_self_shift( ggml_context * ctx0, - ggml_cgraph * gf) = 0; + ggml_cgraph * gf); // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache virtual void build_kv_self_defrag( ggml_context * ctx0, - ggml_cgraph * gf) = 0; + ggml_cgraph * gf); - virtual ggml_tensor * build_inp_k_shift( - ggml_context * ctx0) = 0; + virtual ggml_tensor * build_inp_self_k_shift( + ggml_context * ctx0); virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, - bool worst_case) = 0; + bool worst_case); - virtual ggml_tensor * build_inp_KQ_mask_cross( + virtual ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, int32_t n_tokens, - bool worst_case) = 0; + bool worst_case); virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 8a87f91290eed..3aec6495fe02e 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1079,14 +1079,26 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t cell_count) // int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + if (!kv) { + return 0; + } + return kv->n_tokens(); } int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + if (!kv) { + return 0; + } + return kv->used; } void llama_kv_cache_clear(llama_kv_cache * kv) { + if (!kv) { + return; + } + kv->clear(); } @@ -1095,6 +1107,10 @@ bool llama_kv_cache_seq_rm( llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + if (!kv) { + return true; + } + return kv->seq_rm(seq_id, p0, p1); } @@ -1104,10 +1120,18 @@ void llama_kv_cache_seq_cp( llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (!kv) { + return; + } + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); } void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id) { + if (!kv) { + return; + } + kv->seq_keep(seq_id); } @@ -1117,6 +1141,10 @@ void llama_kv_cache_seq_add( llama_pos p0, llama_pos p1, llama_pos delta) { + if (!kv) { + return; + } + kv->seq_add(seq_id, p0, p1, delta); } @@ -1126,18 +1154,34 @@ void llama_kv_cache_seq_div( llama_pos p0, 
llama_pos p1, int d) { + if (!kv) { + return; + } + kv->seq_div(seq_id, p0, p1, d); } llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id) { + if (!kv) { + return 0; + } + return kv->seq_pos_max(seq_id); } void llama_kv_cache_defrag(llama_kv_cache * kv) { + if (!kv) { + return; + } + kv->defrag(); } bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { + if (!kv) { + return false; + } + return kv->can_shift; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index debbacbb6183b..a0a7816da2ebf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3956,8 +3956,8 @@ struct llm_build_context { } // TODO: tmp - struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + struct ggml_tensor * build_inp_kq_mask_cross() { + ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens, worst_case); cb(cur, "KQ_mask_cross", -1); return cur; @@ -5568,7 +5568,6 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); - cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { Qcur = build_norm(Qcur, @@ -5578,7 +5577,6 @@ struct llm_build_context { } Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); - cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { Kcur = build_norm(Kcur, @@ -5586,11 +5584,12 @@ struct llm_build_context { model.layers[il].attn_k_norm_b, LLM_NORM, il); } + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); - cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them cur = build_lora_mm(model.layers[il].wqkv, cur); @@ -5600,10 +5599,6 @@ struct llm_build_context { Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -5617,40 +5612,17 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); - - //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); - kq = lgf->build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); - cb(kq, "kq_soft_max_ext", il); - - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, 
"kqv_merged_cont", il); - - ggml_build_forward_expand(gf, cur); - - cur = build_lora_mm(model.layers[il].wo, cur); - if (model.layers[il].bo) { - cb(cur, "kqv_wo", il); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } - if (model.layers[il].bo) { - cur = ggml_add(ctx0, cur, model.layers[il].bo); - } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { @@ -9652,7 +9624,7 @@ struct llm_build_context { // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + // struct ggml_tensor * KQ_mask_enc = build_inp_kq_mask(false); // for (int il = 0; il < n_layer; ++il) { // struct ggml_tensor * inpSA = inpL; @@ -9781,8 +9753,8 @@ struct llm_build_context { // struct ggml_tensor * embd_enc = build_inp_embd_enc(); // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + // struct ggml_tensor * KQ_mask_dec = build_inp_kq_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_kq_mask_cross(); // for (int il = 0; il < n_layer; ++il) { // struct ggml_tensor * inpSA = inpL; diff --git a/src/llama.cpp b/src/llama.cpp index 3db1644775fe7..9bacc9e9b4bea 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -328,6 +328,11 @@ struct llama_context * llama_init_from_model( try { // TODO: make static method of llama_context switch (model->arch) { + case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_NOMIC_BERT: + ctx = new llama_context(*model, params); + break; case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_MAMBA: From ad870c49f4bc838ed0408bdc4bc976739019c286 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Feb 2025 19:52:42 +0200 Subject: [PATCH 60/84] context : fix causal input for cache-less case ggml-ci --- src/llama-context.cpp | 91 ++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 53 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6b2a11ad69097..648a669b16e6a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -48,6 +48,7 @@ llama_context::llama_context( // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self if (cparams.n_batch < GGML_KQ_MASK_PAD) { LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); cparams.n_batch = GGML_KQ_MASK_PAD; @@ -2127,60 +2128,44 @@ void llama_context::input_set(const llama_ubatch & ubatch) { } if (inp_kq_mask) { - // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
if (cparams.causal_attn) { - // TODO: need to use the batch directly to construct the masks - GGML_ABORT("TODO"); - - //const int64_t n_kv = ubatch.n_tokens; - //const int64_t n_tokens = ubatch.n_tokens; - //const int64_t n_seq_tokens = ubatch.n_seq_tokens; - //const int64_t n_seqs = ubatch.n_seqs; - - //float * data = nullptr; - - //if (inp_kq_mask) { - // GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); - // data = (float *) inp_kq_mask->data; - //} - - //// For causal attention, use only the previous KV cells - //// of the correct sequence for each token of the ubatch. - //// It's assumed that if a token in the batch has multiple sequences, they are equivalent. - //for (int h = 0; h < 1; ++h) { - // for (int s = 0; s < n_seqs; ++s) { - // const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // for (int j = 0; j < n_seq_tokens; ++j) { - // const llama_pos pos = ubatch.pos[s*n_seq_tokens + j]; - - // for (int i = 0; i < n_kv; ++i) { - // float f; - // if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - // f = -INFINITY; - // } else { - // if (hparams.use_alibi) { - // f = -std::abs(kv_self.cells[i].pos - pos); - // } else { - // f = 0.0f; - // } - // } - - // if (data) { - // data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; - // } - // } - // } - // } - - // if (data) { - // for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - // for (int j = 0; j < n_kv; ++j) { - // data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; - // } - // } - // } - //} + const int64_t n_kv = ubatch.n_tokens; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + float * data = (float *) inp_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id && ubatch.pos[ti] <= ubatch.pos[tj]) { + if (hparams.use_alibi) { + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; + } + } + } + } + } } else { const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; From 08011c2ca12ee95b2041561f69ef0cc0be865dca Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 20 Feb 2025 20:54:18 +0200 Subject: [PATCH 61/84] context : add llama_kv_cache_recurrent prototype ggml-ci --- src/llama-context.cpp | 548 +++++++++++++++++++++++++++++++++++------- src/llama-context.h | 20 +- src/llama-kv-cache.h | 9 +- 3 files changed, 476 insertions(+), 101 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 648a669b16e6a..64728e8b592ef 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -359,17 +359,17 @@ int32_t llama_context::max_nodes() const { } llama_kv_cache * llama_context::get_kv_self() { - LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); + LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); return nullptr; } const llama_kv_cache * llama_context::get_kv_self() const { - LLAMA_LOG_DEBUG("%s: llama_context does 
not have a KV cache\n", __func__); + LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); return nullptr; } void llama_context::kv_self_update() { - LLAMA_LOG_DEBUG("%s: llama_context does not have a KV cache\n", __func__); + LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); } enum llama_pooling_type llama_context::pooling_type() const { @@ -2246,14 +2246,7 @@ llama_context_kv_self::llama_context_kv_self( ggml_type type_k = params.type_k; ggml_type type_v = params.type_v; - // Mamba only needs a constant number of KV cache cells per sequence - if (llama_model_is_recurrent(&model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states - } + GGML_ASSERT(!llama_model_is_recurrent(&model)); GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); @@ -2286,6 +2279,61 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } +void llama_context_kv_self::kv_self_update() { + auto & kv = kv_self; + + if (kv.has_shift) { + if (!kv.can_shift) { + GGML_ABORT("The current context does not support K-shift"); + } + + // apply K-shift if needed + if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + ggml_backend_sched_reset(sched.get()); + + auto * gf = graph_init(); + + build_kv_self_shift(ctx_compute.get(), gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set({}); + + graph_compute(gf, false); + + need_reserve = true; + } + + { + kv.has_shift = false; + + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; + } + } + } + + // defragment the KV cache if needed + if (kv.do_defrag) { + ggml_backend_sched_reset(sched.get()); + + auto * gf = graph_init(); + + build_kv_self_defrag(ctx_compute.get(), gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + // no input + //input_set({}); + + graph_compute(gf, false); + + kv.do_defrag = false; + + need_reserve = true; + } +} + ggml_cgraph * llama_context_kv_self::graph_init() { inp_embd_enc = nullptr; inp_pos_bucket = nullptr; @@ -2310,7 +2358,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const int32_t n_tokens = batch.n_tokens; @@ -2470,7 +2518,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -2552,7 +2600,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { const bool logits_all = n_outputs_all == n_tokens_all; sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, + /* simple_split */ true, /* logits_all */ logits_all); // reserve output buffer @@ -2569,18 +2617,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { const auto & n_ubatch = cparams.n_ubatch; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = sbatch.split_equal(n_ubatch); - } - } else { - ubatch = sbatch.split_simple(n_ubatch); - } + ubatch = sbatch.split_simple(n_ubatch); // count the outputs in this u_batch { @@ -2617,7 +2654,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { bg.save(slot_info); - if (!kv_self.recurrent) { + { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important @@ -2821,10 +2858,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -llama_pos llama_context_kv_self::pos_max() const { - return kv_self.pos_max(); -} - uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { return kv_self.get_padding(cparams); } @@ -3062,61 +3095,6 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } -void llama_context_kv_self::kv_self_update() { - auto & kv = kv_self; - - if (kv.has_shift) { - if (!kv.can_shift) { - GGML_ABORT("The current context does not support K-shift"); - } - - // apply K-shift if needed - if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - ggml_backend_sched_reset(sched.get()); - - auto * gf = graph_init(); - - build_kv_self_shift(ctx_compute.get(), gf); - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - input_set({}); - - graph_compute(gf, false); - - need_reserve = true; - } - - { - kv.has_shift = false; - - for (uint32_t i = 0; i < kv.size; ++i) { - kv.cells[i].delta = 0; - } - } - } - - // defragment the KV cache if needed - if (kv.do_defrag) { - ggml_backend_sched_reset(sched.get()); - - auto * gf = graph_init(); - - build_kv_self_defrag(ctx_compute.get(), gf); - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - // no input - //input_set({}); - - graph_compute(gf, false); - - kv.do_defrag = false; - - need_reserve = true; - } -} - ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { inp_self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); ggml_set_input(inp_self_k_shift); @@ -3176,7 +3154,9 @@ ggml_tensor * llama_context_kv_self::build_attn( // store to KV cache { - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + GGML_ASSERT(!kv_self.recurrent); + + const auto kv_head = worst_case ? 
kv_self.size - n_tokens : kv_self.head; GGML_ASSERT(kv_self.size == n_ctx); @@ -3684,22 +3664,406 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( llama_context_recurrent::llama_context_recurrent( const llama_model & model, const llama_context_params & params) : - llama_context_kv_self(model, params) { + llama_context(model, params), + kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); + + const auto & hparams = model.hparams; + + LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + + // Mamba only needs a constant number of KV cache cells per sequence + GGML_ASSERT(llama_model_is_recurrent(&model)); + + // Mamba needs at least as many KV cells as there are sequences kept at any time + uint32_t kv_size = std::max((uint32_t) 1, params.n_seq_max); + // it's probably best to keep as much precision as possible for the states + ggml_type type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + ggml_type type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + + if (!hparams.vocab_only) { + if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + throw std::runtime_error("failed to initialize self-attention cache"); + } + + { + const size_t memory_size_k = kv_self.size_k_bytes(); + const size_t memory_size_v = kv_self.size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } + } } llama_context_recurrent::~llama_context_recurrent() = default; +llama_kv_cache * llama_context_recurrent::get_kv_self() { + return &kv_self; +} + +const llama_kv_cache * llama_context_recurrent::get_kv_self() const { + return &kv_self; +} + +void llama_context_recurrent::kv_self_update() { + // noop +} + ggml_cgraph * llama_context_recurrent::graph_init() { - inp_s_copy = nullptr; - inp_s_mask = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; - return llama_context_kv_self::graph_init(); + return llama_context::graph_init(); +} + +int llama_context_recurrent::encode(llama_batch & inp_batch) { + GGML_UNUSED(inp_batch); + + LLAMA_LOG_ERROR("%s: encode() not supported for recurrent models\n", __func__); + return -1; +} + +int llama_context_recurrent::decode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self.pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + + const int64_t n_tokens_all = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + + // TODO: remove this stuff + class batch_guard { + public: + batch_guard(llama_kv_cache & kv_self) : kv_slot_restorer(kv_self) { + } + + ~batch_guard() { + if (!is_done) { + kv_slot_restorer.restore(); + } + } + + void done() { + is_done = true; + } + + void save(const llama_kv_cache_slot_info & slot_info) { + kv_slot_restorer.save(slot_info); + } + + private: + bool is_done = false; + + llama_kv_slot_restorer kv_slot_restorer; + }; + + batch_guard bg(kv_self); + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int64_t i = 0; i < n_tokens_all; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); + } + } + } + + GGML_ASSERT(n_tokens_all <= cparams.n_batch); + + GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + n_queued_tokens += n_tokens_all; + + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + embd_seq.clear(); + + int64_t n_outputs_all = 0; + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + } else if (logits_all || embd_pooled) { + n_outputs_all = n_tokens_all; + } else { + // keep last output only + n_outputs_all = 1; + } + + const bool logits_all = n_outputs_all == n_tokens_all; + + sbatch.from_batch(batch, n_embd, + /* simple_split */ false, + /* logits_all */ logits_all); + + // reserve output buffer + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; + + while (sbatch.n_tokens > 0) { + llama_ubatch ubatch = llama_ubatch(); + + const auto & n_ubatch = cparams.n_ubatch; + + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = sbatch.split_equal(n_ubatch); + } + + // count the outputs in this u_batch + { + int32_t n_outputs_new = 0; + + if (n_outputs_all == n_tokens_all) { + n_outputs_new = ubatch.n_tokens; + } else { + GGML_ASSERT(ubatch.output); + for (uint32_t i = 0; i < ubatch.n_tokens; i++) { + n_outputs_new += (int32_t) (ubatch.output[i] != 0); + } + } + + // needs to happen before the graph is built + n_outputs = n_outputs_new; + } + + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + kv_self_update(); + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } + + const auto slot_info = kv_self.find_slot(ubatch); + if 
(!slot_info) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + return -3; + } + + bg.save(slot_info); + } + + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + + // reserve a worst case graph if needed + if (need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(sched.get()); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + need_reserve = false; + } + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; + auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; + + if (t_embd && res.t_embd_pooled) { + t_embd = res.t_embd_pooled; + } + + // extract logits + if (t_logits && n_outputs > 0) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits + n_outputs_prev*n_vocab; + + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd && n_outputs > 0) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd + n_outputs_prev*n_embd; + + if (n_outputs) { + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings (cleared before processing each batch) + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + n_outputs_prev += n_outputs; + } + + // finalize the batch processing + bg.done(); + + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (int64_t i = 0; i < n_outputs_all; ++i) { + int64_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; + + // wait for the computation to finish (automatically done when obtaining the model output) + //synchronize(); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
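As an aside, the output-mapping step a few lines above (inverting sbatch.out_ids into output_ids) can be illustrated in isolation. A minimal sketch, assuming a simplified setup where out_ids holds the batch index of each produced output row; the buffer sizes and values here are made up for illustration and only mirror the member names used above:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // out_ids: for each produced output row, the index of the corresponding token
        // in the submitted batch (rows can come out of batch order after sbatch splitting)
        const std::vector<int64_t> out_ids = {2, 0, 3};

        const int64_t n_batch = 5;                     // tokens in the submitted batch
        std::vector<int32_t> output_ids(n_batch, -1);  // -1: this token produced no output

        bool sorted_output = true;

        for (int64_t i = 0; i < (int64_t) out_ids.size(); ++i) {
            output_ids[out_ids[i]] = (int32_t) i;      // batch index -> row in the logits/embd buffers
            if (out_ids[i] != i) {
                sorted_output = false;
            }
        }

        // a later lookup for "logits of batch token 3" reads row output_ids[3] of the logits buffer
        std::printf("sorted_output = %d, row for token 3 = %d\n", sorted_output ? 1 : 0, output_ids[3]);
    }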
+ ggml_backend_sched_reset(sched.get()); + + return 0; } void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // call base functionality - llama_context_kv_self::input_set(ubatch); + llama_context::input_set(ubatch); GGML_ASSERT(kv_self.recurrent); diff --git a/src/llama-context.h b/src/llama-context.h index c605cec6f6a19..df6acb265d52f 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -374,9 +374,6 @@ class llama_context_kv_self : public llama_context { virtual int encode(llama_batch & inp_batch) override; virtual int decode(llama_batch & inp_batch) override; - // max token position across all sequences in the current context - llama_pos pos_max() const; - // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; @@ -453,9 +450,7 @@ class llama_context_kv_self : public llama_context { }; // a recurrent transformer (ie.e RWKV, Mamba) -// TODO: temporary reuse kv_self, but in the future, implement recurrent-specific context with specific cache -//class llama_context_recurrent : public llama_context { -class llama_context_recurrent : public llama_context_kv_self { +class llama_context_recurrent : public llama_context { public: llama_context_recurrent( const llama_model & model, @@ -463,8 +458,16 @@ class llama_context_recurrent : public llama_context_kv_self { virtual ~llama_context_recurrent(); + virtual llama_kv_cache * get_kv_self() override; + virtual const llama_kv_cache * get_kv_self() const override; + + virtual void kv_self_update() override; + virtual ggml_cgraph * graph_init() override; + virtual int encode(llama_batch & inp_batch) override; + virtual int decode(llama_batch & inp_batch) override; + virtual ggml_tensor * build_inp_s_copy( ggml_context * ctx0, bool worst_case) override; @@ -524,10 +527,11 @@ class llama_context_recurrent : public llama_context_kv_self { protected: virtual void input_set(const llama_ubatch & ubatch) override; + // TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models? 
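The class split above (llama_context_recurrent deriving directly from llama_context and owning its own cache member) follows a plain virtual-dispatch pattern: the base context reports that it has no KV cache, while each specialized context returns the cache it owns. A minimal sketch with made-up stand-in types (context_base, context_recurrent and kv_cache_stub are illustrative only, not the real API):

    #include <cstdio>

    struct kv_cache_stub {};                      // stand-in for the real cache types

    struct context_base {                         // plays the role of llama_context here
        virtual ~context_base() = default;

        virtual kv_cache_stub * get_kv_self() {
            std::printf("warning: this context has no KV cache\n");
            return nullptr;                       // the generic context owns no cache
        }
    };

    struct context_recurrent : context_base {     // plays the role of llama_context_recurrent
        kv_cache_stub kv_self;                    // the specialized context owns its cache

        kv_cache_stub * get_kv_self() override { return &kv_self; }
    };

    int main() {
        context_recurrent rctx;
        context_base * ctx = &rctx;

        // callers only see the base interface; the override decides whether a cache exists
        std::printf("cache present: %s\n", ctx->get_kv_self() ? "yes" : "no");
    }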
+ llama_kv_cache_recurrent kv_self; + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - - // TODO: add recurrent cache }; // For internal test use diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 049193fd0f176..dda9bfec48846 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -48,7 +48,6 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences -// TODO: add llama_hparams & struct llama_kv_cache { llama_kv_cache(const llama_hparams & hparams); virtual ~llama_kv_cache() = default; @@ -108,7 +107,10 @@ struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; + + // TODO: remove this and implement llama_kv_cache_recurrent instead bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token + bool v_trans = true; // the value tensor is transposed bool can_shift = false; @@ -141,6 +143,11 @@ struct llama_kv_cache { bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; +// TODO: temporary reusing llama_kv_cache -- implement recurrent cache and simplify llama_kv_cache +struct llama_kv_cache_recurrent : public llama_kv_cache { + using llama_kv_cache::llama_kv_cache; +}; + // // kv cache restore // From 2645a7d9a999de249e15ff3dae5eea1866221b57 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 10:28:42 +0200 Subject: [PATCH 62/84] context : add save/load for recurrent context ggml-ci --- src/llama-context.cpp | 42 ++++++++++++++++++++++++++++++++++++++---- src/llama-context.h | 6 ++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 64728e8b592ef..4ce54b0d6f890 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3657,6 +3657,40 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( return inp_kq_mask_cross; } +// state save/load + +size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { + llama_context::state_get_data(io); + + kv_self.state_write(io); + + return io.n_bytes(); +} + +size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { + llama_context::state_set_data(io); + + kv_self.state_read(io); + + return io.n_bytes(); +} + +size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { + llama_context::state_seq_get_data(io, seq_id); + + kv_self.state_write(io, seq_id); + + return io.n_bytes(); +} + +size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { + llama_context::state_seq_set_data(io, seq_id); + + kv_self.state_read(io, seq_id); + + return io.n_bytes(); +} + // // llama_context_recurrent // @@ -4527,7 +4561,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( // state save/load -size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { +size_t llama_context_recurrent::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); kv_self.state_write(io); @@ -4535,7 +4569,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { return io.n_bytes(); } -size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { +size_t llama_context_recurrent::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); kv_self.state_read(io); @@ -4543,7 +4577,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { return io.n_bytes(); } -size_t 
llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { +size_t llama_context_recurrent::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context::state_seq_get_data(io, seq_id); kv_self.state_write(io, seq_id); @@ -4551,7 +4585,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { +size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context::state_seq_set_data(io, seq_id); kv_self.state_read(io, seq_id); diff --git a/src/llama-context.h b/src/llama-context.h index df6acb265d52f..9d8b702208b0b 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -525,6 +525,12 @@ class llama_context_recurrent : public llama_context { bool worst_case) override; protected: + virtual size_t state_get_data(llama_io_write_i & io) override; + virtual size_t state_set_data(llama_io_read_i & io) override; + + virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + virtual void input_set(const llama_ubatch & ubatch) override; // TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models? From 548c230dff1060820b7ef66653896accee3772cc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 12:10:57 +0200 Subject: [PATCH 63/84] graph : remove worst_case from the API ggml-ci --- src/llama-context.cpp | 1902 ++++++++++++++++++++-------------------- src/llama-context.h | 274 +++--- src/llama-graph.cpp | 44 +- src/llama-graph.h | 39 +- src/llama-kv-cache.cpp | 1 + src/llama-model.cpp | 132 ++- src/llama-model.h | 3 +- 7 files changed, 1193 insertions(+), 1202 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4ce54b0d6f890..dc1eb70b85a5e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -17,11 +17,12 @@ llama_context::llama_context( const llama_model & model, const llama_context_params & params) : - model (model), - t_start_us(model.t_start_us), - t_load_us (model.t_load_us) { + model (model) { LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); + t_start_us = model.t_start_us; + t_load_us = model.t_load_us; + const auto & hparams = model.hparams; cparams.n_seq_max = std::max(1u, params.n_seq_max); @@ -186,136 +187,174 @@ void llama_context::init() { return; } - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model.devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; + { + // buffer types used for the compute buffer of each backend + backend_buft.clear(); + backend_ptrs.clear(); + + for (auto & backend : backends) { + auto * buft = ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && 
!model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } - const size_t max_nodes = this->max_nodes(); + const size_t max_nodes = this->max_nodes(); - // buffer used to store the computation graph and the tensor meta data - // TODO: move to base class - buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + // buffer used to store the computation graph and the tensor meta data + // TODO: move to base class + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model.n_devices() > 1 && - model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && - cparams.offload_kqv; + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + cparams.offload_kqv; - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { - // ignore CPU backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } } } + + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + } } - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + reserve(); +} + +void llama_context::synchronize() { + ggml_backend_sched_synchronize(sched.get()); + + // FIXME: if multiple single tokens are evaluated without a synchronization, + // the stats will be added to the prompt evaluation stats + // this should only happen when using batch size 1 to 
evaluate a batch + + // add the evaluation to the stats + if (n_queued_tokens == 1) { + if (!cparams.no_perf) { + t_eval_us += ggml_time_us() - t_compute_start_us; + } + n_eval++; + } else if (n_queued_tokens > 1) { + if (!cparams.no_perf) { + t_p_eval_us += ggml_time_us() - t_compute_start_us; + } + n_p_eval += n_queued_tokens; + } - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + // get a more accurate load time, upon first eval + if (n_queued_tokens > 0 && !has_evaluated_once) { + t_load_us = ggml_time_us() - t_start_us; + has_evaluated_once = true; } - // initialize scheduler with the worst-case graph - { - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + n_queued_tokens = 0; + t_compute_start_us = 0; +} - int n_splits_pp = -1; - int n_nodes_pp = -1; +void llama_context::reserve() { + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - int n_splits_tg = -1; - int n_nodes_tg = -1; + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - // reserve pp graph first so that buffers are only allocated once - { - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } + int n_splits_pp = -1; + int n_nodes_pp = -1; - n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_pp = ggml_graph_n_nodes(gf); - } + int n_splits_tg = -1; + int n_nodes_tg = -1; - // reserve with tg graph to get the number of splits and nodes - { - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_tg, true); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } - n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_tg = ggml_graph_n_nodes(gf); - } + // max number of outputs + n_outputs = n_tokens; - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - { - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_pp, true); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); - throw std::runtime_error("failed to allocate compute buffers"); - } + // reserve pp graph first so that buffers are only allocated once + { + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp); + if 
(!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); } - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } + n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + n_nodes_pp = ggml_graph_n_nodes(gf); + } + + // reserve with tg graph to get the number of splits and nodes + { + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_tg); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); } + n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + n_nodes_tg = ggml_graph_n_nodes(gf); + } - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + { + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); } + } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); } } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + } } const llama_model & llama_context::get_model() const { @@ -547,201 +586,141 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -void llama_context::synchronize() { - ggml_backend_sched_synchronize(sched.get()); - - // FIXME: if multiple single tokens are evaluated without a synchronization, - // the stats will be added to the prompt evaluation stats - // this should only happen when using batch size 1 to evaluate a batch - - // add the evaluation to 
the stats - if (n_queued_tokens == 1) { - if (!cparams.no_perf) { - t_eval_us += ggml_time_us() - t_compute_start_us; - } - n_eval++; - } else if (n_queued_tokens > 1) { - if (!cparams.no_perf) { - t_p_eval_us += ggml_time_us() - t_compute_start_us; - } - n_p_eval += n_queued_tokens; +int llama_context::encode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; } - // get a more accurate load time, upon first eval - if (n_queued_tokens > 0 && !has_evaluated_once) { - t_load_us = ggml_time_us() - t_start_us; - has_evaluated_once = true; - } + // temporary allocate memory for the input batch if needed + llama_batch_allocr batch_allocr(inp_batch, 0); - n_queued_tokens = 0; - t_compute_start_us = 0; -} + const llama_batch & batch = batch_allocr.batch; -ggml_cgraph * llama_context::graph_init() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; + const int32_t n_tokens = batch.n_tokens; - inp_kq_mask = nullptr; - inp_kq_mask_cnv = nullptr; + const auto & hparams = model.hparams; - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - ctx_compute.reset(ggml_init(params)); + if (batch.token) { + for (int32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } - return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); -} + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); -llama_graph_result llama_context::graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - bool worst_case) { - return model.build_graph(ctx, gf, this, cparams, ubatch, worst_case); -} + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } -enum ggml_status llama_context::graph_compute( - ggml_cgraph * gf, - bool batched) { - int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; - ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; + n_queued_tokens += n_tokens; - if (backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); - } + const int64_t n_embd = hparams.n_embd; - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + // reserve output buffer + if (output_reserve(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; - return status; -} + for (int32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } -int32_t llama_context::output_reserve(int32_t n_outputs) { - const auto & hparams = model.hparams; - const auto & vocab = model.vocab; + n_outputs = n_tokens; - const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch); - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + ggml_backend_sched_alloc_graph(sched.get(), gf); - logits_size = has_logits ? n_vocab*n_outputs_max : 0; - embd_size = has_embd ? n_embd*n_outputs_max : 0; + input_set(ubatch); - if (output_ids.empty()) { - // init, never resized afterwards - output_ids.resize(n_batch); + const auto compute_status = graph_compute(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; } - const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); + auto * t_embd = res.t_embd_pooled ? 
res.t_embd_pooled : res.t_embd; - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!buf_output || prev_size < new_size) { - if (buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - buf_output = nullptr; - logits = nullptr; - embd = nullptr; - } + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + GGML_ASSERT(embd != nullptr); - float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + // extract token embeddings + float * embd_out = embd; - logits = has_logits ? output_base : nullptr; - embd = has_embd ? output_base + logits_size : nullptr; - - output_size = n_outputs_max; - - // set all ids as invalid (negative) - std::fill(output_ids.begin(), output_ids.end(), -1); - - ggml_backend_buffer_clear(buf_output.get(), 0); - - n_outputs = 0; - - return n_outputs_max; -} - -void llama_context::output_reorder() { - auto & out_ids = sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = model.vocab.n_tokens(); - const uint32_t n_embd = model.hparams.n_embd; + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); - GGML_ASSERT((size_t) n_outputs == out_ids.size()); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - // TODO: is there something more efficient which also minimizes swaps? 
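The selection-sort loop below keeps the logits/embeddings rows in sync with out_ids while performing at most n_outputs - 1 swaps. A standalone sketch of the same mechanism, with a single float standing in for each n_vocab-sized row (the data values are made up):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // row i of the output buffers corresponds to batch token out_ids[i];
        // the goal is to bring the rows into batch order
        std::vector<int64_t> out_ids = {2, 0, 3, 1};
        std::vector<float>   logits  = {0.2f, 0.0f, 0.3f, 0.1f};  // one float stands in for each row

        const int32_t n_outputs = (int32_t) out_ids.size();

        // selection sort: at most n_outputs - 1 swaps, and every swap of out_ids
        // is mirrored on the payload so the two stay in sync
        for (int32_t i = 0; i < n_outputs - 1; ++i) {
            int32_t j_min = i;
            for (int32_t j = i + 1; j < n_outputs; ++j) {
                if (out_ids[j] < out_ids[j_min]) {
                    j_min = j;
                }
            }
            if (j_min == i) {
                continue;
            }
            std::swap(out_ids[i], out_ids[j_min]);
            std::swap(logits[i],  logits[j_min]);  // the real code swaps whole n_vocab/n_embd rows here
        }

        for (int32_t i = 0; i < n_outputs; ++i) {
            std::printf("row %d -> token %lld, logit %.1f\n", i, (long long) out_ids[i], logits[i]);
        }
    }

Selection sort fits here because each swap is expensive (whole rows are moved), and among simple in-place sorts it performs the fewest swaps.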
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); } - } - if (embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); } - } - } - std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - output_ids[out_ids[i]] = i; } - out_ids.clear(); } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; } -int llama_context::encode(llama_batch & inp_batch) { +int llama_context::decode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -752,103 +731,142 @@ int llama_context::encode(llama_batch & inp_batch) { const llama_batch & batch = batch_allocr.batch; - const int32_t n_tokens = batch.n_tokens; - + const auto & vocab = model.vocab; const auto & hparams = model.hparams; + const int32_t n_vocab = vocab.n_tokens(); + + const int64_t n_tokens = batch.n_tokens; + const int64_t n_embd = hparams.n_embd; + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT if (batch.token) { - for (int32_t i = 0; i < n_tokens; ++i) { + for (int64_t i = 0; i < n_tokens; ++i) { if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; + LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); + throw std::runtime_error("invalid token"); } } } - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); + // micro-batching is not possible without KV cache + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context requires n_ubatch >= n_tokens"); if (t_compute_start_us == 0) { t_compute_start_us = ggml_time_us(); } - n_queued_tokens += n_tokens; - const int64_t n_embd = hparams.n_embd; + // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* 
logits_all */ true); + embd_seq.clear(); + + int64_t n_outputs_all = 0; + + // count outputs + if (batch.logits && !embd_pooled) { + for (uint32_t i = 0; i < n_tokens; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + } else if (logits_all || embd_pooled) { + n_outputs_all = n_tokens; + } else { + // keep last output only + n_outputs_all = 1; + } + + const bool logits_all = n_outputs_all == n_tokens; + + sbatch.from_batch(batch, n_embd, + /* simple_split */ true, + /* logits_all */ logits_all); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); // reserve output buffer - if (output_reserve(n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + if (output_reserve(n_outputs_all) < n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; }; - for (int32_t i = 0; i < n_tokens; ++i) { - output_ids[i] = i; - } - - n_outputs = n_tokens; - - GGML_ASSERT(need_reserve == false); + n_outputs = n_outputs_all; ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + auto res = graph_build(ctx_compute.get(), gf, ubatch); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(sched.get(), gf); input_set(ubatch); - const auto compute_status = graph_compute(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; + const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } } - auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; + auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; + + if (t_embd && res.t_embd_pooled) { + t_embd = res.t_embd_pooled; + } + + // extract logits + if (t_logits && n_outputs > 0) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits; + + if (n_outputs) { + GGML_ASSERT(n_outputs <= n_outputs_all); + GGML_ASSERT(n_outputs*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } + } // extract embeddings - if (t_embd) { + if (t_embd && n_outputs > 0) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); switch (cparams.pooling_type) { case LLAMA_POOLING_TYPE_NONE: { - GGML_ASSERT(embd != nullptr); - // extract token embeddings + GGML_ASSERT(embd != nullptr); float * embd_out = embd; - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + if (n_outputs) { + GGML_ASSERT(n_outputs <= n_outputs_all); + GGML_ASSERT(n_outputs*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); + } } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: case LLAMA_POOLING_TYPE_LAST: { - // extract sequence embeddings + // extract sequence embeddings (cleared before processing each batch) auto & embd_seq_out = embd_seq; - embd_seq_out.clear(); - - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - for (int32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; } @@ -858,11 +876,18 @@ int llama_context::encode(llama_batch & inp_batch) { } break; case LLAMA_POOLING_TYPE_RANK: { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { GGML_ABORT("unknown pooling type"); @@ -870,6 +895,28 @@ int llama_context::encode(llama_batch & inp_batch) { } } + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (int64_t i = 0; i < n_outputs_all; ++i) { + int64_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // wait for the computation to finish (automatically done when obtaining the model output) + //synchronize(); + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. 
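For reference, the causal KQ mask that input_set() builds below (and that the earlier hunk builds directly from the ubatch) can be reduced to a small standalone sketch. This is only an approximation of the real layout: it assumes one sequence id per token and n_kv == n_tokens, whereas the actual code walks the ubatch.n_seq_id[s0] lists and uses the flat data[h*(n_kv*n_tokens) + tj*n_kv + ti] indexing:

    #include <cmath>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Row tj may attend to column ti only if ti belongs to the same sequence and
    // pos[ti] <= pos[tj]; with ALiBi the allowed entries hold -|pos[ti] - pos[tj]|,
    // otherwise 0.0f. Disallowed entries are -INFINITY.
    static std::vector<float> build_causal_mask(
            const std::vector<int> & pos,
            const std::vector<int> & seq,   // one sequence id per token (simplification)
            bool use_alibi) {
        const size_t n = pos.size();        // n_kv == n_tokens in this sketch
        std::vector<float> mask(n*n, -std::numeric_limits<float>::infinity());

        for (size_t tj = 0; tj < n; ++tj) {      // query token (row)
            for (size_t ti = 0; ti < n; ++ti) {  // key token (column)
                if (seq[ti] == seq[tj] && pos[ti] <= pos[tj]) {
                    mask[tj*n + ti] = use_alibi ? -std::fabs((float) (pos[ti] - pos[tj])) : 0.0f;
                }
            }
        }

        return mask;
    }

    int main() {
        // two sequences interleaved in one ubatch: tokens 0-1 are seq 0, tokens 2-3 are seq 1
        const std::vector<int> pos = {0, 1, 0, 1};
        const std::vector<int> seq = {0, 0, 1, 1};

        const auto mask = build_causal_mask(pos, seq, /*use_alibi=*/false);

        for (size_t j = 0; j < pos.size(); ++j) {
            for (size_t i = 0; i < pos.size(); ++i) {
                std::printf("%6.1f ", mask[j*pos.size() + i]);
            }
            std::printf("\n");
        }
    }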
ggml_backend_sched_reset(sched.get()); @@ -877,212 +924,438 @@ int llama_context::encode(llama_batch & inp_batch) { return 0; } -int llama_context::decode(llama_batch & inp_batch) { - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, 0); +// +// input +// - const llama_batch & batch = batch_allocr.batch; +void llama_context::input_set(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; - const auto & vocab = model.vocab; - const auto & hparams = model.hparams; + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; - const int32_t n_vocab = vocab.n_tokens(); + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + } - const int64_t n_tokens = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; + if (ubatch.embd) { + const int64_t n_embd = hparams.n_embd; + const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); + } - if (batch.token) { - for (int64_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - throw std::runtime_error("invalid token"); + if (ubatch.pos && inp_pos) { + const int64_t n_tokens = ubatch.n_tokens; + + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); + } + + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); + + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch.output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch.output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); } } } - // micro-batching is not possible without KV cache - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context requires n_ubatch >= n_tokens"); + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; - if (t_compute_start_us == 0) { - t_compute_start_us = ggml_time_us(); + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); + + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with 
pooling_type == MEAN"); + + sum[seq_id] += ubatch.n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } } - n_queued_tokens += n_tokens; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; - embd_seq.clear(); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - int64_t n_outputs_all = 0; + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - // count outputs - if (batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens; ++i) { - n_outputs_all += batch.logits[i] != 0; + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } + + if (inp_kq_mask) { + if (cparams.causal_attn) { + const int64_t n_kv = ubatch.n_tokens; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + float * data = (float *) inp_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s 
= 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id && ubatch.pos[ti] <= ubatch.pos[tj]) { + if (hparams.use_alibi) { + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; + } + } + } + } + } + } else { + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_stride = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + + float * data = (float *) inp_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; + } + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } + } + } + } + } + } + + GGML_ASSERT( + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); +} + +// +// output +// + +int32_t llama_context::output_reserve(int32_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; } - } else if (logits_all || embd_pooled) { - n_outputs_all = n_tokens; - } else { - // keep last output only - n_outputs_all = 1; } - const bool logits_all = n_outputs_all == n_tokens; - - sbatch.from_batch(batch, n_embd, - /* simple_split */ true, - /* logits_all */ logits_all); - - const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); - // reserve output buffer - if (output_reserve(n_outputs_all) < n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; + logits = has_logits ? output_base : nullptr; + embd = has_embd ? output_base + logits_size : nullptr; - n_outputs = n_outputs_all; + output_size = n_outputs_max; - GGML_ASSERT(need_reserve == false); + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); - ggml_backend_sched_reset(sched.get()); - ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + ggml_backend_buffer_clear(buf_output.get(), 0); - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + n_outputs = 0; - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + return n_outputs_max; +} - ggml_backend_sched_alloc_graph(sched.get(), gf); +void llama_context::output_reorder() { + auto & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; - input_set(ubatch); + GGML_ASSERT((size_t) n_outputs == out_ids.size()); - const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; } + out_ids.clear(); } +} - auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; - auto * t_embd = cparams.embeddings ? 
res.t_embd : nullptr; - - if (t_embd && res.t_embd_pooled) { - t_embd = res.t_embd_pooled; - } +// +// graph +// - // extract logits - if (t_logits && n_outputs > 0) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(logits != nullptr); +ggml_cgraph * llama_context::graph_init() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; - float * logits_out = logits; + inp_kq_mask = nullptr; + inp_kq_mask_cnv = nullptr; - if (n_outputs) { - GGML_ASSERT(n_outputs <= n_outputs_all); - GGML_ASSERT(n_outputs*n_vocab <= (int64_t) logits_size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); - } - } + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; - // extract embeddings - if (t_embd && n_outputs > 0) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); - GGML_ASSERT(backend_embd != nullptr); + ctx_compute.reset(ggml_init(params)); - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(embd != nullptr); - float * embd_out = embd; + return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); +} - if (n_outputs) { - GGML_ASSERT(n_outputs <= n_outputs_all); - GGML_ASSERT(n_outputs*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings (cleared before processing each batch) - auto & embd_seq_out = embd_seq; +llama_graph_result llama_context::graph_build( + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch) { + return model.build_graph(ctx, gf, this, cparams, ubatch); +} - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // extract the rerank score - a single float per sequence - auto & embd_seq_out = embd_seq; +enum ggml_status llama_context::graph_compute( + ggml_cgraph * gf, + bool batched) { + int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? 
threadpool_batch : threadpool; - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(1); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); } - // set output mappings - { - bool sorted_output = true; - - GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); - - for (int64_t i = 0; i < n_outputs_all; ++i) { - int64_t out_id = sbatch.out_ids[i]; - output_ids[out_id] = i; - if (out_id != i) { - sorted_output = false; - } - } - - if (sorted_output) { - sbatch.out_ids.clear(); - } + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - // wait for the computation to finish (automatically done when obtaining the model output) - //synchronize(); + auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(sched.get()); + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); - return 0; + return status; } +// +// graph build API +// + void llama_context::build_cb( ggml_tensor * cur, const char * name, @@ -1307,10 +1580,8 @@ ggml_tensor * llama_context::build_inp_pos( } ggml_tensor * llama_context::build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { - const int32_t n_out_ids = worst_case ? n_tokens : n_outputs; + ggml_context * ctx0) { + const int32_t n_out_ids = n_outputs; inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); ggml_set_input(inp_out_ids); @@ -1336,6 +1607,22 @@ ggml_tensor * llama_context::build_inp_cls( return inp_cls; } +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) { + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch + GGML_UNUSED(causal); + GGML_UNUSED(swa); + + inp_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_kq_mask, "KQ_mask", -1); + ggml_set_input(inp_kq_mask); + + inp_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_kq_mask, GGML_TYPE_F16) : inp_kq_mask; +} + ggml_tensor * llama_context::build_attn( ggml_context * ctx0, ggml_cgraph * gf, @@ -1346,8 +1633,7 @@ ggml_tensor * llama_context::build_attn( ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto & n_ctx = cparams.n_ctx; @@ -1364,7 +1650,6 @@ ggml_tensor * llama_context::build_attn( const auto & n_embd_head_v = hparams.n_embd_head_v; // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - GGML_UNUSED(worst_case); const auto n_kv = n_tokens; struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); @@ -1450,27 +1735,9 @@ ggml_tensor * llama_context::build_attn( if (wo_b) { cur = ggml_add(ctx0, cur, wo_b); - } - - return cur; -} - -void llama_context::build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa, - bool worst_case) { - // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - GGML_UNUSED(causal); - GGML_UNUSED(swa); - GGML_UNUSED(worst_case); - - inp_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_kq_mask, "KQ_mask", -1); - ggml_set_input(inp_kq_mask); + } - inp_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_kq_mask, GGML_TYPE_F16) : inp_kq_mask; + return cur; } // @@ -1497,7 +1764,7 @@ void llama_context::perf_reset() { } // -// state +// state save/load // class llama_io_write_dummy : public llama_io_write_i { @@ -1857,367 +2124,110 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // write logits { - const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); - - io.write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - io.write(logits, logits_size * sizeof(float)); - } - } - - // write embeddings - { - const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); - - io.write(&embd_size, sizeof(embd_size)); - - if (embd_size) { - io.write(embd, embd_size * sizeof(float)); - } - } - - return io.n_bytes(); -} - -size_t llama_context::state_set_data(llama_io_read_i & io) { - // read model info - { - const std::string cur_arch_str = llm_arch_name(model.arch); - - std::string arch_str; - io.read_string(arch_str); - if (cur_arch_str != arch_str) { - throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); - } - // TODO: add more info which needs to be identical but which is not verified otherwise - } - - // read output ids - { - auto n_outputs = this->n_outputs; - io.read_to(&n_outputs, sizeof(n_outputs)); - - if (n_outputs > output_reserve(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } - - std::vector output_pos; - - if (n_outputs) { - output_pos.resize(n_outputs); - io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= n_batch()) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); - } - this->output_ids[id] = i; - } - - this->n_outputs = n_outputs; - } - } - - // read logits - { - uint64_t logits_size; - io.read_to(&logits_size, sizeof(logits_size)); - - if (this->logits_size < logits_size) { - throw std::runtime_error("logits buffer 
too small"); - } - - if (logits_size) { - io.read_to(this->logits, logits_size * sizeof(float)); - } - } - - // read embeddings - { - uint64_t embd_size; - io.read_to(&embd_size, sizeof(embd_size)); - - if (this->embd_size < embd_size) { - throw std::runtime_error("embeddings buffer too small"); - } - - if (embd_size) { - io.read_to(this->embd, embd_size * sizeof(float)); - } - } - - return io.n_bytes(); -} - -size_t llama_context::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - GGML_UNUSED(seq_id); - - return io.n_bytes(); -} - -size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - GGML_UNUSED(seq_id); - - return io.n_bytes(); -} - -// -// input -// - -void llama_context::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); - } - - if (ubatch.embd) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); - } - - if (ubatch.pos && inp_pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - - if (!inp_out_ids) { - LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); - int32_t * data = (int32_t *) inp_out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - - float * data = (float *) inp_mean->data; - memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == 
LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; + io.write(&logits_size, sizeof(logits_size)); - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } + if (logits_size) { + io.write(logits, logits_size * sizeof(float)); } + } - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } + // write embeddings + { + const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); + + io.write(&embd_size, sizeof(embd_size)); + + if (embd_size) { + io.write(embd, embd_size * sizeof(float)); } } - if (inp_kq_mask) { - if (cparams.causal_attn) { - const int64_t n_kv = ubatch.n_tokens; - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; + return io.n_bytes(); +} - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); - float * data = (float *) inp_kq_mask->data; +size_t llama_context::state_set_data(llama_io_read_i & io) { + // read model info + { + const std::string cur_arch_str = llm_arch_name(model.arch); - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + std::string arch_str; + io.read_string(arch_str); + if (cur_arch_str != arch_str) { + throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str())); + } + // TODO: add more info which needs to be identical but which is not verified otherwise + } - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; + // read output ids + { + auto n_outputs = this->n_outputs; + io.read_to(&n_outputs, sizeof(n_outputs)); - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < 
n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; + if (n_outputs > output_reserve(n_outputs)) { + throw std::runtime_error("could not reserve outputs"); + } - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id && ubatch.pos[ti] <= ubatch.pos[tj]) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } + std::vector output_pos; - data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; - } - } - } + if (n_outputs) { + output_pos.resize(n_outputs); + io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + if ((uint32_t) id >= n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); } + this->output_ids[id] = i; } - } else { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_stride = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); - float * data = (float *) inp_kq_mask->data; + this->n_outputs = n_outputs; + } + } - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; + // read logits + { + uint64_t logits_size; + io.read_to(&logits_size, sizeof(logits_size)); - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; + if (this->logits_size < logits_size) { + throw std::runtime_error("logits buffer too small"); + } - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; + if (logits_size) { + io.read_to(this->logits, logits_size * sizeof(float)); + } + } - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } + // read embeddings + { + uint64_t embd_size; + io.read_to(&embd_size, sizeof(embd_size)); - data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; - } - } + if (this->embd_size < embd_size) { + throw std::runtime_error("embeddings buffer too small"); + } - for (int i = n_tokens; i < n_stride; ++i) { - data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; - } - } - } - } + if (embd_size) { + io.read_to(this->embd, embd_size * sizeof(float)); } } - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + return io.n_bytes(); +} + +size_t llama_context::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { + GGML_UNUSED(seq_id); + + return io.n_bytes(); +} + +size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { + GGML_UNUSED(seq_id); + + return io.n_bytes(); } // @@ -2235,7 +2245,7 @@ llama_context_kv_self::llama_context_kv_self( LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self.get_padding(cparams)); LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); @@ -2271,6 +2281,13 @@ llama_context_kv_self::llama_context_kv_self( llama_context_kv_self::~llama_context_kv_self() = default; 
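For reference, the input_set() hunks above fill the KQ mask directly in the host buffer: a key/query pair is attendable only when the two tokens share a sequence id and, in the causal case, the key position does not exceed the query position; with ALiBi the 0 / -INFINITY values are replaced by a negative distance bias. Below is a minimal standalone sketch of that rule, using simplified flat arrays and a single sequence id per token instead of the llama_ubatch layout (illustrative only, not part of the patch; build_kq_mask is an invented name):

    // sketch: mirrors the masking rule used by llama_context::input_set() above
    #include <cmath>
    #include <cstdint>
    #include <vector>

    static std::vector<float> build_kq_mask(
            const std::vector<int32_t> & seq_id, // sequence id per token (assumed: one id per token)
            const std::vector<int32_t> & pos,    // position per token
            bool causal,
            bool use_alibi) {
        const size_t n = seq_id.size();
        std::vector<float> mask(n*n, -INFINITY);

        for (size_t j = 0; j < n; ++j) {     // query token
            for (size_t i = 0; i < n; ++i) { // key token
                const bool same_seq = seq_id[i] == seq_id[j];
                const bool visible  = !causal || pos[i] <= pos[j];
                if (same_seq && visible) {
                    mask[j*n + i] = use_alibi ? -std::abs(float(pos[i] - pos[j])) : 0.0f;
                }
            }
        }

        return mask;
    }

The real code additionally pads the query dimension to GGML_KQ_MASK_PAD and casts the mask to F16 when flash attention is enabled, as seen in build_attn_inp() further down.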
+void llama_context_kv_self::reserve() { + // simulate full KV cache + kv_self.n = kv_self.size; + + llama_context::reserve(); +} + llama_kv_cache * llama_context_kv_self::get_kv_self() { return &kv_self; } @@ -2282,6 +2299,8 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { void llama_context_kv_self::kv_self_update() { auto & kv = kv_self; + bool need_reserve = false; + if (kv.has_shift) { if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); @@ -2332,20 +2351,30 @@ void llama_context_kv_self::kv_self_update() { need_reserve = true; } -} -ggml_cgraph * llama_context_kv_self::graph_init() { - inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; - inp_kq_mask_cross = nullptr; + // reserve a worst case graph if needed + if (need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - inp_self_kq_mask = nullptr; - inp_self_kq_mask_cnv = nullptr; - inp_self_kq_mask_swa = nullptr; - inp_self_kq_mask_swa_cnv = nullptr; - inp_self_k_shift = nullptr; + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - return llama_context::graph_init(); + // simulate full KV cache + kv_self.n = kv_self.size; + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(sched.get()); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + } } int llama_context_kv_self::encode(llama_batch & inp_batch) { @@ -2406,14 +2435,11 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { //batch_manager->prepare(ubatch); - // TODO: do reserve - GGML_ASSERT(need_reserve == false); - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + auto res = graph_build(ctx_compute.get(), gf, ubatch); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2658,42 +2684,18 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = get_ctx_padding(cparams); + const uint32_t pad = kv_self.get_padding(cparams); kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); } } //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - // reserve a worst case graph if needed - if (need_reserve) { - LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - 
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - need_reserve = false; - } - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + auto res = graph_build(ctx_compute.get(), gf, ubatch); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -2841,7 +2843,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { // - do not defrag small contexts (i.e. < 2048 tokens) // - count the padding towards the number of used tokens - const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + get_ctx_padding(cparams))/float(kv_self.n)) : 0.0f; + const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + kv_self.get_padding(cparams))/float(kv_self.n)) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { @@ -2858,12 +2860,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) const { - return kv_self.get_padding(cparams); -} - -// llama input - void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; @@ -3095,6 +3091,20 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } +ggml_cgraph * llama_context_kv_self::graph_init() { + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_kq_mask_cross = nullptr; + + inp_self_kq_mask = nullptr; + inp_self_kq_mask_cnv = nullptr; + inp_self_kq_mask_swa = nullptr; + inp_self_kq_mask_swa_cnv = nullptr; + inp_self_k_shift = nullptr; + + return llama_context::graph_init(); +} + ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { inp_self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); ggml_set_input(inp_self_k_shift); @@ -3106,9 +3116,8 @@ void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa, - bool worst_case) { - const auto n_kv = worst_case ? kv_self.size : kv_self.n; + bool swa) { + const auto n_kv = kv_self.n; inp_self_kq_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) @@ -3143,8 +3152,7 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto & n_ctx = cparams.n_ctx; @@ -3156,7 +3164,7 @@ ggml_tensor * llama_context_kv_self::build_attn( { GGML_ASSERT(!kv_self.recurrent); - const auto kv_head = worst_case ? kv_self.size - n_tokens : kv_self.head; + const auto kv_head = kv_self.head; GGML_ASSERT(kv_self.size == n_ctx); @@ -3211,7 +3219,7 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto & kq_mask = is_sliding ? 
inp_self_kq_mask_swa_cnv : inp_self_kq_mask_cnv; - const auto n_kv = worst_case ? kv_self.size : kv_self.n; + const auto n_kv = kv_self.n; const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); @@ -3626,14 +3634,12 @@ void llama_context_kv_self::build_kv_self_defrag( } ggml_tensor * llama_context_kv_self::build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { + ggml_context * ctx0) { const auto & hparams = model.hparams; const int64_t n_embd = hparams.n_embd; // TODO: not sure if this is correct - const int32_t n_outputs_enc = worst_case ? n_tokens : embd_enc.size() / n_embd; + const int32_t n_outputs_enc = embd_enc.size() / n_embd; inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); ggml_set_input(inp_embd_enc); @@ -3643,13 +3649,12 @@ ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { + int32_t n_tokens) { const auto & hparams = model.hparams; const int64_t n_embd = hparams.n_embd; // TODO: not sure if this is correct - const int32_t n_outputs_enc = worst_case ? n_tokens : embd_enc.size() / n_embd; + const int32_t n_outputs_enc = embd_enc.size() / n_embd; inp_kq_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); ggml_set_input(inp_kq_mask_cross); @@ -3738,6 +3743,11 @@ llama_context_recurrent::llama_context_recurrent( llama_context_recurrent::~llama_context_recurrent() = default; +void llama_context_recurrent::reserve() { + // TODO: implement recurrent-specific reserve logic + llama_context::reserve(); +} + llama_kv_cache * llama_context_recurrent::get_kv_self() { return &kv_self; } @@ -3750,13 +3760,6 @@ void llama_context_recurrent::kv_self_update() { // noop } -ggml_cgraph * llama_context_recurrent::graph_init() { - inp_s_copy = nullptr; - inp_s_mask = nullptr; - - return llama_context::graph_init(); -} - int llama_context_recurrent::encode(llama_batch & inp_batch) { GGML_UNUSED(inp_batch); @@ -3917,34 +3920,11 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - // reserve a worst case graph if needed - if (need_reserve) { - LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - need_reserve = false; - } - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, false); + auto res = graph_build(ctx_compute.get(), gf, ubatch); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d 
leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -4147,24 +4127,32 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { } } +ggml_cgraph * llama_context_recurrent::graph_init() { + inp_s_copy = nullptr; + inp_s_mask = nullptr; + + return llama_context::graph_init(); +} + ggml_tensor * llama_context_recurrent::build_inp_s_copy( - ggml_context * ctx0, - bool worst_case) { - const auto n_kv = worst_case ? kv_self.size : kv_self.n; + ggml_context * ctx0) { + const auto n_kv = kv_self.n; inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); //cb(inp_s_copy, "inp_s_copy", -1); ggml_set_input(inp_s_copy); + return inp_s_copy; } ggml_tensor * llama_context_recurrent::build_inp_s_mask( - ggml_context * ctx0, - bool worst_case) { - const auto n_kv = worst_case ? kv_self.size : kv_self.n; + ggml_context * ctx0) { + const auto n_kv = kv_self.n; + inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); //cb(inp_s_mask, "inp_s_mask", -1); ggml_set_input(inp_s_mask); + return inp_s_mask; } @@ -4174,12 +4162,10 @@ ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, - int32_t n_tokens, int32_t n_state, - int32_t n_seqs, - bool worst_case) { - const auto n_kv = worst_case ? kv_self.size : kv_self.n; - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + int32_t n_seqs) { + const auto n_kv = kv_self.n; + const auto kv_head = kv_self.head; struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); @@ -4210,13 +4196,10 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; - const auto & n_tokens = ubatch.n_tokens; - - const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + const auto kv_head = kv_self.head; const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = hparams.ssm_d_inner; @@ -4240,11 +4223,11 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( // (ab)using the KV cache to store the states struct ggml_tensor * conv = build_copy_mask_state( ctx0, gf, conv_states_all, state_copy, state_mask, - n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + hparams.n_embd_k_s(), n_seqs); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); struct ggml_tensor * ssm = build_copy_mask_state( ctx0, gf, ssm_states_all, state_copy, state_mask, - n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + hparams.n_embd_v_s(), n_seqs); ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} @@ -4345,20 +4328,18 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_load( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto token_shift_count = hparams.token_shift_count; - const auto & n_tokens = ubatch.n_tokens; const int64_t n_seqs = ubatch.n_seqs; struct ggml_tensor * token_shift_all = kv_self.k_l[il]; struct ggml_tensor * token_shift = build_copy_mask_state( ctx0, gf, token_shift_all, state_copy, state_mask, - n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + hparams.n_embd_k_s(), n_seqs); token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); @@ -4369,17 +4350,15 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto token_shift_count = hparams.token_shift_count; const auto n_embd = hparams.n_embd; - const auto & n_tokens = ubatch.n_tokens; - const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; - const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + const auto kv_head = kv_self.head; return ggml_cpy( ctx0, @@ -4396,8 +4375,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { const auto & hparams = model.hparams; const auto n_tokens = ubatch.n_tokens; @@ -4407,7 +4385,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( const auto n_head = n_embd / head_size; const auto n_head_kv = hparams.n_head_kv(il); - const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + const auto kv_head = kv_self.head; const auto & layer = model.layers[il]; @@ -4516,7 +4494,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( struct ggml_tensor * wkv_state = build_copy_mask_state( ctx0, gf, kv_self.v_l[il], state_copy, state_mask, - n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + hparams.n_embd_v_s(), n_seqs); struct ggml_tensor * wkv_output; if (is_qrwkv) { diff --git a/src/llama-context.h b/src/llama-context.h index 9d8b702208b0b..d4ab5d509b155 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -22,16 +22,25 @@ using llama_loras = std::unordered_map; // basic transformer without KV cache struct llama_context : public llama_graph_i { +public: llama_context( const llama_model & model, const llama_context_params & params); virtual ~llama_context(); - // init scheduler and compute buffers + // init scheduler and compute buffers, reserve worst-case graphs // call once after the context is constructed virtual void init(); + virtual void synchronize(); + +protected: + // called by init() to reserve the worst-case graphs + // override in child classes + virtual void reserve(); + +public: const llama_model & get_model() const; const llama_cparams & get_cparams() const; @@ -93,33 +102,6 @@ struct llama_context : public llama_graph_i { int32_t il_start, int32_t il_end); - //// - - virtual void synchronize(); - - // zero-out inputs and create ggml_context - virtual ggml_cgraph * graph_init(); - - // TODO: add encode/decode graphs - virtual llama_graph_result graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - bool worst_case); - - // returns the result of ggml_backend_sched_graph_compute_async execution - virtual enum ggml_status graph_compute( - ggml_cgraph * gf, - bool batched); - - // Make sure enough space is available for outputs. - // Returns max number of outputs for which space was reserved. - virtual int32_t output_reserve(int32_t n_outputs); - - // make the outputs have the same order they had in the user-provided batch - // TODO: maybe remove this - virtual void output_reorder(); - // encode a batch of tokens by evaluating the encoder part of the transformer // // - lctx: llama context @@ -145,6 +127,60 @@ struct llama_context : public llama_graph_i { // virtual int decode(llama_batch & inp_batch); +protected: + // + // input + // + + // when the compute graph is built, it creates the input tensors that it needs + // the contents of the input tensors are set by the input_set() function + + virtual void input_set(const llama_ubatch & ubatch); + + // base input tensors + ggml_tensor * inp_tokens; // I32 [n_batch] + ggml_tensor * inp_embd; // F32 [n_embd, n_batch] + ggml_tensor * inp_pos; // I32 [n_batch] + ggml_tensor * inp_out_ids; // I32 [n_outputs] + ggml_tensor * inp_mean; // F32 [n_batch, n_batch] + ggml_tensor * inp_cls; // I32 [n_batch] + + // KQ mask input tensors + ggml_tensor * inp_kq_mask; // F32 [n_tokens, n_batch] + ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch] + + // + // output + // + + // Make sure enough space is available for outputs. + // Returns max number of outputs for which space was reserved. 
+ virtual int32_t output_reserve(int32_t n_outputs); + + // make the outputs have the same order they had in the user-provided batch + // TODO: maybe remove this + virtual void output_reorder(); + + // + // graph + // + + // zero-out inputs and create the ctx_context for the compute graph + virtual ggml_cgraph * graph_init(); + + // TODO: add encode/decode graphs + virtual llama_graph_result graph_build( + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch); + + // returns the result of ggml_backend_sched_graph_compute_async execution + virtual enum ggml_status graph_compute( + ggml_cgraph * gf, + bool batched); + + ggml_context_ptr ctx_compute; + // // graph build API (generic) // @@ -193,9 +229,7 @@ struct llama_context : public llama_graph_i { int32_t n_tokens); virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case); + ggml_context * ctx0); virtual ggml_tensor * build_inp_mean( ggml_context * ctx0, @@ -209,8 +243,7 @@ struct llama_context : public llama_graph_i { ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa, - bool worst_case); + bool swa); virtual ggml_tensor * build_attn( ggml_context * ctx0, @@ -222,15 +255,32 @@ struct llama_context : public llama_graph_i { ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case); + int il); +public: + // // perf + // virtual llama_perf_context_data perf_get_data() const; virtual void perf_reset(); +protected: + mutable int64_t t_start_us = 0; + mutable int64_t t_load_us = 0; + mutable int64_t t_p_eval_us = 0; + mutable int64_t t_eval_us = 0; + + mutable int64_t t_compute_start_us = 0; + mutable int64_t n_queued_tokens = 0; + + mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) + mutable int32_t n_eval = 0; // number of eval calls + +public: + // // state save/load + // virtual size_t state_get_size(); virtual size_t state_get_data( uint8_t * dst, size_t size); @@ -265,31 +315,15 @@ struct llama_context : public llama_graph_i { size_t n_token_count); protected: - // state save/load - virtual size_t state_get_data(llama_io_write_i & io); virtual size_t state_set_data(llama_io_read_i & io); virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); - // input - - virtual void input_set(const llama_ubatch & ubatch); - - // base input tensors - ggml_tensor * inp_tokens; // I32 [n_batch] - ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - ggml_tensor * inp_pos; // I32 [n_batch] - ggml_tensor * inp_out_ids; // I32 [n_outputs] - ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - ggml_tensor * inp_cls; // I32 [n_batch] - - // KQ mask input tensors - ggml_tensor * inp_kq_mask; // F32 [n_tokens, n_batch] - ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch] - + // // members + // const llama_model & model; @@ -311,7 +345,9 @@ struct llama_context : public llama_graph_i { ggml_backend_sched_ptr sched; - ggml_context_ptr ctx_compute; + // buffer types used for the compute buffer of each backend + std::vector backend_ptrs; + std::vector backend_buft; // memory buffers used to evaluate the model std::vector buf_compute_meta; @@ -340,19 +376,7 @@ struct llama_context : public llama_graph_i { std::vector output_ids; // map batch token positions to ids of the logits and embd buffers - bool need_reserve = false; bool has_evaluated_once = false; - - mutable int64_t t_start_us = 0; - mutable int64_t t_load_us 
= 0; - mutable int64_t t_p_eval_us = 0; - mutable int64_t t_eval_us = 0; - - mutable int64_t t_compute_start_us = 0; - mutable int64_t n_queued_tokens = 0; - - mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - mutable int32_t n_eval = 0; // number of eval calls }; // transformer with a self-attention KV cache @@ -364,18 +388,40 @@ class llama_context_kv_self : public llama_context { virtual ~llama_context_kv_self(); +protected: + virtual void reserve() override; + +public: virtual llama_kv_cache * get_kv_self() override; virtual const llama_kv_cache * get_kv_self() const override; virtual void kv_self_update() override; - virtual ggml_cgraph * graph_init() override; - virtual int encode(llama_batch & inp_batch) override; virtual int decode(llama_batch & inp_batch) override; - // certain implementations could require a padding for the context size - uint32_t get_ctx_padding(const llama_cparams & cparams) const; +protected: + // + // input + // + + virtual void input_set(const llama_ubatch & ubatch) override; + + ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * inp_self_k_shift; // I32 [kv_size] + + // + // graph + // + + virtual ggml_cgraph * graph_init() override; + + // + // graph build + // virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; @@ -383,8 +429,7 @@ class llama_context_kv_self : public llama_context { ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa, - bool worst_case) override; + bool swa) override; virtual ggml_tensor * build_attn( ggml_context * ctx0, @@ -396,8 +441,7 @@ class llama_context_kv_self : public llama_context { ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case) override; + int il) override; virtual void build_kv_self_shift( ggml_context * ctx0, @@ -422,31 +466,27 @@ class llama_context_kv_self : public llama_context { struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] virtual ggml_tensor * build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) override; + ggml_context * ctx0) override; virtual ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) override; + int32_t n_tokens) override; + + // + // state save/load + // -protected: virtual size_t state_get_data(llama_io_write_i & io) override; virtual size_t state_set_data(llama_io_read_i & io) override; virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; - virtual void input_set(const llama_ubatch & ubatch) override; + // + // members + // llama_kv_cache kv_self; - - ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch] - ggml_tensor * inp_self_k_shift; // I32 [kv_size] }; // a recurrent transformer (ie.e RWKV, Mamba) @@ -458,23 +498,43 @@ class llama_context_recurrent : public llama_context { virtual ~llama_context_recurrent(); +protected: + virtual void reserve() override; + +public: virtual llama_kv_cache * get_kv_self() override; virtual const llama_kv_cache * get_kv_self() 
const override; virtual void kv_self_update() override; - virtual ggml_cgraph * graph_init() override; - virtual int encode(llama_batch & inp_batch) override; virtual int decode(llama_batch & inp_batch) override; +protected: + // + // input + // + + virtual void input_set(const llama_ubatch & ubatch) override; + + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + + // + // graph + // + + virtual ggml_cgraph * graph_init() override; + + // + // graph build + // + virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0, - bool worst_case) override; + ggml_context * ctx0) override; virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0, - bool worst_case) override; + ggml_context * ctx0) override; virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -482,10 +542,8 @@ class llama_context_recurrent : public llama_context { ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, - int32_t n_tokens, int32_t n_state, - int32_t n_seqs, - bool worst_case) override; + int32_t n_seqs) override; virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -494,8 +552,7 @@ class llama_context_recurrent : public llama_context { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) override; + int il) override; virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -503,15 +560,13 @@ class llama_context_recurrent : public llama_context { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) override; + int il) override; virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il, - bool worst_case) override; + int il) override; virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -521,23 +576,24 @@ class llama_context_recurrent : public llama_context { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) override; + int il) override; + + // + // state save/load + // -protected: virtual size_t state_get_data(llama_io_write_i & io) override; virtual size_t state_set_data(llama_io_read_i & io) override; virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; - virtual void input_set(const llama_ubatch & ubatch) override; + // + // members + // // TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models? 
llama_kv_cache_recurrent kv_self; - - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] }; // For internal test use diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index d9d4e00e98ba0..af556f5bb81f0 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -12,8 +12,7 @@ ggml_tensor * llama_graph_i::build_attn( ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(wo); @@ -24,7 +23,6 @@ ggml_tensor * llama_graph_i::build_attn( GGML_UNUSED(n_tokens); GGML_UNUSED(kq_scale); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; @@ -57,12 +55,8 @@ ggml_tensor * llama_graph_i::build_inp_self_k_shift( } ggml_tensor * llama_graph_i::build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { + ggml_context * ctx0) { GGML_UNUSED(ctx0); - GGML_UNUSED(n_tokens); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; @@ -70,21 +64,17 @@ ggml_tensor * llama_graph_i::build_inp_embd_enc( ggml_tensor * llama_graph_i::build_inp_kq_mask_cross( ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) { + int32_t n_tokens) { GGML_UNUSED(ctx0); GGML_UNUSED(n_tokens); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } ggml_tensor * llama_graph_i::build_inp_s_copy ( - ggml_context * ctx0, - bool worst_case) { + ggml_context * ctx0) { GGML_UNUSED(ctx0); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -92,10 +82,8 @@ ggml_tensor * llama_graph_i::build_inp_s_copy ( } ggml_tensor * llama_graph_i::build_inp_s_mask( - ggml_context * ctx0, - bool worst_case) { + ggml_context * ctx0) { GGML_UNUSED(ctx0); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -108,19 +96,15 @@ ggml_tensor * llama_graph_i::build_copy_mask_state( ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, - int32_t n_tokens, int32_t n_state, - int32_t n_seqs, - bool worst_case) { + int32_t n_seqs) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(s); GGML_UNUSED(state_copy); GGML_UNUSED(state_mask); - GGML_UNUSED(n_tokens); GGML_UNUSED(n_state); GGML_UNUSED(n_seqs); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -134,8 +118,7 @@ ggml_tensor * llama_graph_i::build_mamba_layer( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); @@ -143,7 +126,6 @@ ggml_tensor * llama_graph_i::build_mamba_layer( GGML_UNUSED(state_mask); GGML_UNUSED(ubatch); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -156,15 +138,13 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(state_copy); GGML_UNUSED(state_mask); GGML_UNUSED(ubatch); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -175,13 +155,11 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(token_shift); 
GGML_UNUSED(ubatch); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -196,8 +174,7 @@ ggml_tensor * llama_graph_i::build_rwkv6_time_mix( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case) { + int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); @@ -206,7 +183,6 @@ ggml_tensor * llama_graph_i::build_rwkv6_time_mix( GGML_UNUSED(state_mask); GGML_UNUSED(ubatch); GGML_UNUSED(il); - GGML_UNUSED(worst_case); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); diff --git a/src/llama-graph.h b/src/llama-graph.h index 8d237431e657a..05349e5872710 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -69,9 +69,7 @@ class llama_graph_i { int32_t n_tokens) = 0; virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case) = 0; + ggml_context * ctx0) = 0; virtual ggml_tensor * build_inp_mean( ggml_context * ctx0, @@ -85,8 +83,7 @@ class llama_graph_i { ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa, - bool worst_case) = 0; + bool swa) = 0; virtual ggml_tensor * build_attn( ggml_context * ctx0, @@ -98,8 +95,7 @@ class llama_graph_i { ggml_tensor * v_cur, int32_t n_tokens, float kq_scale, - int il, - bool worst_case); + int il); virtual void build_kv_self_shift( ggml_context * ctx0, @@ -114,22 +110,17 @@ class llama_graph_i { ggml_context * ctx0); virtual ggml_tensor * build_inp_embd_enc( - ggml_context * ctx0, - int32_t n_tokens, - bool worst_case); + ggml_context * ctx0); virtual ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, - int32_t n_tokens, - bool worst_case); + int32_t n_tokens); virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0, - bool worst_case); + ggml_context * ctx0); virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0, - bool worst_case); + ggml_context * ctx0); virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -137,10 +128,8 @@ class llama_graph_i { ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, - int32_t n_tokens, int32_t n_state, - int32_t n_seqs, - bool worst_case); + int32_t n_seqs); virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -149,8 +138,7 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case); + int il); virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -158,15 +146,13 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case); + int il); virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il, - bool worst_case); + int il); virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -176,6 +162,5 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il, - bool worst_case); + int il); }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 3aec6495fe02e..e1b07c9932166 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -610,6 +610,7 @@ struct llama_kv_cache_slot_info llama_kv_cache::find_slot( // sanity check return llama_kv_cache_slot_info(n >= n_seqs); } + // otherwise, one cell per token. 
if (n_tokens > size) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a0a7816da2ebf..8eb99995ea232 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3834,7 +3834,6 @@ struct llm_build_context { const int32_t n_tokens; const int32_t n_ctx_orig; - const bool worst_case; const bool flash_attn; const enum llama_pooling_type pooling_type; @@ -3851,8 +3850,7 @@ struct llm_build_context { llama_graph_i * lgf, const llama_model & model, const llama_cparams & cparams, - const llama_ubatch & ubatch, - bool worst_case) : + const llama_ubatch & ubatch) : model (model), hparams (model.hparams), cparams (cparams), @@ -3879,7 +3877,6 @@ struct llm_build_context { norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), n_ctx_orig (cparams.n_ctx_orig_yarn), - worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -3910,7 +3907,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_out_ids(ctx0); cb(cur, "inp_out_ids", -1); return cur; @@ -3949,7 +3946,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0); cb(cur, "embd_enc", -1); return cur; @@ -3957,7 +3954,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_kq_mask_cross() { - ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens, worst_case); + ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens); cb(cur, "KQ_mask_cross", -1); return cur; @@ -4258,7 +4255,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il, worst_case); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il); cb(cur, "kqv_out", il); return cur; @@ -4405,7 +4402,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4566,7 +4563,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4722,7 +4719,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4838,7 +4835,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4943,7 +4940,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5066,7 +5063,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5218,7 +5215,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5340,7 +5337,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5441,7 +5438,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5555,7 +5552,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - lgf->build_attn_inp(ctx0, n_tokens, false, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, false, false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5700,7 +5697,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); inpL = build_norm(inpL, model.tok_norm, @@ -5803,7 +5800,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); if (model.pos_embd) { // inp_pos - contains the positions @@ -5945,7 +5942,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -6096,7 +6093,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor 
* inpSA = inpL; @@ -6210,7 +6207,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6323,7 +6320,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6441,7 +6438,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6588,7 +6585,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6711,7 +6708,7 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6855,7 +6852,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -6961,7 +6958,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7067,7 +7064,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7178,7 +7175,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7297,7 +7294,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7425,7 +7422,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct 
ggml_tensor * inpSA = inpL; @@ -7626,7 +7623,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { // norm @@ -7734,7 +7731,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { // norm @@ -7864,7 +7861,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7977,8 +7974,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); for (int il = 0; il < n_layer; ++il) { // norm @@ -7988,7 +7985,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); - cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -8039,7 +8036,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -8187,7 +8184,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, true); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8322,7 +8319,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8442,7 +8439,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8566,7 +8563,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8687,7 +8684,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + 
lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8815,7 +8812,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -8959,7 +8956,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9089,7 +9086,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9252,7 +9249,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9470,7 +9467,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9951,7 +9948,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10045,7 +10042,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10175,7 +10172,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10296,7 +10293,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10414,8 +10411,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ 
-10425,7 +10422,7 @@ struct llm_build_context { const llama_layer * layer = &model.layers[il]; struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load( - ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ctx0, gf, state_copy, state_mask, ubatch, il ); struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); @@ -10441,7 +10438,7 @@ struct llm_build_context { 1 ); - cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -10464,7 +10461,7 @@ struct llm_build_context { ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 ); - ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il)); if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { cur = ggml_scale(ctx0, cur, 0.5F); @@ -10506,8 +10503,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10519,7 +10516,7 @@ struct llm_build_context { const llama_layer * layer = &model.layers[il]; struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load( - ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ctx0, gf, state_copy, state_mask, ubatch, il ); struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); @@ -10532,10 +10529,10 @@ struct llm_build_context { 1 ); - cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); + cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); - ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); + ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il)); struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -10601,7 +10598,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case); + lgf->build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10912,9 +10909,8 @@ llama_graph_result llama_model::build_graph( ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, - const llama_ubatch & ubatch, - bool worst_case) const { - struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case); + const llama_ubatch & ubatch) const { + struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch); switch (arch) 
{ case LLM_ARCH_LLAMA: diff --git a/src/llama-model.h b/src/llama-model.h index 94e7622943937..b2d75e593f2f3 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -374,8 +374,7 @@ struct llama_model { ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, - const llama_ubatch & ubatch, - bool worst_case) const; + const llama_ubatch & ubatch) const; private: struct impl; From ebf1bdf97bed94d46c48b3c3b14f1893fa5bfa5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 14:35:23 +0200 Subject: [PATCH 64/84] context : add logs ggml-ci --- examples/save-load-state/save-load-state.cpp | 2 +- src/llama-context.cpp | 76 ++++++++++++++++++-- src/llama-context.h | 10 +-- 3 files changed, 76 insertions(+), 12 deletions(-) diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 77b1572a9dec5..760ebbbf08788 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -15,7 +15,7 @@ int main(int argc, char ** argv) { return 1; } - print_build_info(); + common_init(); if (params.n_predict < 0) { params.n_predict = 16; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index dc1eb70b85a5e..2a7a4083b547f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -180,6 +180,8 @@ llama_context::llama_context( llama_context::~llama_context() = default; void llama_context::init() { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + const auto & hparams = model.hparams; if (hparams.vocab_only) { @@ -188,13 +190,15 @@ void llama_context::init() { } { - // buffer types used for the compute buffer of each backend + LLAMA_LOG_DEBUG("%s: enumerating backends\n", __func__); + backend_buft.clear(); backend_ptrs.clear(); for (auto & backend : backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { // use the host buffer of the first device CPU for faster transfer of the intermediate state auto * dev = model.devices[0]; @@ -203,14 +207,18 @@ void llama_context::init() { buft = host_buft; } } + backend_buft.push_back(buft); backend_ptrs.push_back(backend.get()); } + LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size()); + const size_t max_nodes = this->max_nodes(); + LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); + // buffer used to store the computation graph and the tensor meta data - // TODO: move to base class buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); // TODO: move these checks to ggml_backend_sched @@ -247,6 +255,8 @@ void llama_context::init() { } } + LLAMA_LOG_DEBUG("%s: calling reserve()\n", __func__); + reserve(); } @@ -286,15 +296,17 @@ void llama_context::reserve() { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + // max number of outputs + n_outputs = n_tokens; + + LLAMA_LOG_DEBUG("%s: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs); + int n_splits_pp = -1; int n_nodes_pp = -1; int n_splits_tg = -1; int n_nodes_tg = -1; - // max number of outputs - n_outputs = n_tokens; - // reserve pp graph first so that buffers are only allocated once { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, 
nullptr, nullptr, nullptr, nullptr}; @@ -521,21 +533,29 @@ int64_t llama_context::n_pos_per_token() const { void llama_context::attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + this->threadpool = threadpool; this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool; } void llama_context::detach_threadpool() { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + this->threadpool = nullptr; this->threadpool_batch = nullptr; } void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { + LLAMA_LOG_DEBUG("%s: n_threads = %d, n_threads_batch = %d\n", __func__, n_threads, n_threads_batch); + cparams.n_threads = n_threads; cparams.n_threads_batch = n_threads_batch; } void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + this->abort_callback = abort_callback; this->abort_callback_data = abort_callback_data; @@ -549,21 +569,29 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void } void llama_context::set_embeddings(bool value) { + LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); + cparams.embeddings = value; } void llama_context::set_causal_attn(bool value) { + LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); + cparams.causal_attn = value; } void llama_context::set_adapter_lora( - struct llama_adapter_lora * adapter, + llama_adapter_lora * adapter, float scale) { + LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale); + loras[adapter] = scale; } bool llama_context::rm_adapter_lora( - struct llama_adapter_lora * adapter) { + llama_adapter_lora * adapter) { + LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter); + auto pos = loras.find(adapter); if (pos != loras.end()) { loras.erase(pos); @@ -574,6 +602,8 @@ bool llama_context::rm_adapter_lora( } void llama_context::clear_adapter_lora() { + LLAMA_LOG_DEBUG("%s: call\n", __func__); + loras.clear(); } @@ -583,6 +613,8 @@ bool llama_context::apply_adapter_cvec( int32_t n_embd, int32_t il_start, int32_t il_end) { + LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end); + return cvec.apply(model, data, len, n_embd, il_start, il_end); } @@ -2085,8 +2117,12 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file } size_t llama_context::state_get_data(llama_io_write_i & io) { + LLAMA_LOG_DEBUG("%s: writing state\n", __func__); + // write model info { + LLAMA_LOG_DEBUG("%s: - writing model info\n", __func__); + const std::string arch_str = llm_arch_name(model.arch); io.write_string(arch_str); // TODO: add more model-specific info which should prevent loading the session file if not identical @@ -2094,6 +2130,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // write output ids { + LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__); + output_reorder(); const auto n_outputs = this->n_outputs; @@ -2124,6 +2162,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // write logits { + LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); + const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens()); io.write(&logits_size, sizeof(logits_size)); @@ -2135,6 +2175,8 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { // write embeddings { + LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__); + const uint64_t embd_size = 
std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd); io.write(&embd_size, sizeof(embd_size)); @@ -2148,8 +2190,12 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { } size_t llama_context::state_set_data(llama_io_read_i & io) { + LLAMA_LOG_DEBUG("%s: reading state\n", __func__); + // read model info { + LLAMA_LOG_DEBUG("%s: - reading model info\n", __func__); + const std::string cur_arch_str = llm_arch_name(model.arch); std::string arch_str; @@ -2162,6 +2208,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { // read output ids { + LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__); + auto n_outputs = this->n_outputs; io.read_to(&n_outputs, sizeof(n_outputs)); @@ -2189,6 +2237,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { // read logits { + LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__); + uint64_t logits_size; io.read_to(&logits_size, sizeof(logits_size)); @@ -2203,6 +2253,8 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { // read embeddings { + LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__); + uint64_t embd_size; io.read_to(&embd_size, sizeof(embd_size)); @@ -2285,6 +2337,8 @@ void llama_context_kv_self::reserve() { // simulate full KV cache kv_self.n = kv_self.size; + LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); + llama_context::reserve(); } @@ -2297,6 +2351,8 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { } void llama_context_kv_self::kv_self_update() { + LLAMA_LOG_DEBUG("%s: kv_self_update()\n", __func__); + auto & kv = kv_self; bool need_reserve = false; @@ -2306,6 +2362,8 @@ void llama_context_kv_self::kv_self_update() { GGML_ABORT("The current context does not support K-shift"); } + LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__); + // apply K-shift if needed if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched.get()); @@ -2334,6 +2392,8 @@ void llama_context_kv_self::kv_self_update() { // defragment the KV cache if needed if (kv.do_defrag) { + LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); + ggml_backend_sched_reset(sched.get()); auto * gf = graph_init(); @@ -3667,6 +3727,7 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); + LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); kv_self.state_write(io); return io.n_bytes(); @@ -3675,6 +3736,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); + LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); kv_self.state_read(io); return io.n_bytes(); diff --git a/src/llama-context.h b/src/llama-context.h index d4ab5d509b155..bc6a0e291edbd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -55,11 +55,13 @@ struct llama_context : public llama_graph_i { virtual int32_t max_nodes() const; - // returns nullptr + // self-attention: + + // if the context does not have a KV cache, return nullptr virtual llama_kv_cache * get_kv_self(); virtual const llama_kv_cache * get_kv_self() const; - // noop + // if the context does not have a KV cache, noop virtual void kv_self_update(); virtual enum llama_pooling_type pooling_type() const; @@ -87,11 +89,11 @@ struct llama_context : public llama_graph_i { virtual void set_causal_attn(bool value); virtual void set_adapter_lora( - struct llama_adapter_lora * 
adapter, + llama_adapter_lora * adapter, float scale); virtual bool rm_adapter_lora( - struct llama_adapter_lora * adapter); + llama_adapter_lora * adapter); virtual void clear_adapter_lora(); From f588a70da3a1177d98e8bc00fe074ab010093709 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 15:08:25 +0200 Subject: [PATCH 65/84] context : wrap input tensors in struct ggml-ci --- src/llama-context.cpp | 196 ++++++++++++++++++++---------------------- src/llama-context.h | 40 +++++---- 2 files changed, 115 insertions(+), 121 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2a7a4083b547f..40d4e47a448bc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -966,32 +966,32 @@ void llama_context::input_set(const llama_ubatch & ubatch) { if (ubatch.token) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + ggml_backend_tensor_set(inp.tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp.tokens)); } if (ubatch.embd) { const int64_t n_embd = hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); + ggml_backend_tensor_set(inp.embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp.embd)); } - if (ubatch.pos && inp_pos) { + if (ubatch.pos && inp.pos) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp_pos)); + ggml_backend_tensor_set(inp.pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp.pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); + //GGML_ASSERT(inp.out_ids && "every model that can must skip unused outputs"); - if (!inp_out_ids) { - LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); + if (!inp.out_ids) { + LLAMA_LOG_WARN("%s: 'inp.out_ids' is not created\n", __func__); } else { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); - int32_t * data = (int32_t *) inp_out_ids->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp.out_ids->buffer)); + int32_t * data = (int32_t *) inp.out_ids->data; if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { @@ -1020,11 +1020,11 @@ void llama_context::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); + GGML_ASSERT(inp.mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.mean->buffer)); - float * data = (float *) inp_mean->data; - memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); + float * data = (float *) inp.mean->data; + memset(inp.mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp.mean)); std::vector sum(n_tokens, 0); @@ -1061,11 +1061,11 @@ void llama_context::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + GGML_ASSERT(inp.cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + uint32_t * data = (uint32_t *) 
inp.cls->data; + memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; @@ -1088,11 +1088,11 @@ void llama_context::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); + GGML_ASSERT(inp.cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - uint32_t * data = (uint32_t *) inp_cls->data; - memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); + uint32_t * data = (uint32_t *) inp.cls->data; + memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); @@ -1120,15 +1120,15 @@ void llama_context::input_set(const llama_ubatch & ubatch) { } } - if (inp_kq_mask) { + if (inp.kq_mask) { if (cparams.causal_attn) { const int64_t n_kv = ubatch.n_tokens; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); - float * data = (float *) inp_kq_mask->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp.kq_mask->buffer)); + float * data = (float *) inp.kq_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -1165,9 +1165,9 @@ void llama_context::input_set(const llama_ubatch & ubatch) { const int64_t n_seqs = ubatch.n_seqs; const int64_t n_stride = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.kq_mask->buffer)); - float * data = (float *) inp_kq_mask->data; + float * data = (float *) inp.kq_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -1329,15 +1329,7 @@ void llama_context::output_reorder() { // ggml_cgraph * llama_context::graph_init() { - inp_tokens = nullptr; - inp_embd = nullptr; - inp_pos = nullptr; - inp_out_ids = nullptr; - inp_mean = nullptr; - inp_cls = nullptr; - - inp_kq_mask = nullptr; - inp_kq_mask_cnv = nullptr; + inp = {}; struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta.size(), @@ -1563,11 +1555,11 @@ ggml_tensor * llama_context::build_inp_embd( struct ggml_tensor * inpL; if (ubatch.token) { - inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp_tokens, "inp_tokens", -1); - ggml_set_input(inp_tokens); + inp.tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp.tokens, "inp_tokens", -1); + ggml_set_input(inp.tokens); - inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens); + inpL = ggml_get_rows(ctx0, tok_embd, inp.tokens); // apply lora for embedding tokens if needed for (const auto & lora : loras) { @@ -1581,15 +1573,15 @@ ggml_tensor * llama_context::build_inp_embd( struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, inp_tokens) + ggml_get_rows(ctx0, lw->a, inp.tokens) ), scale); inpL = ggml_add(ctx0, inpL, inpL_delta); } } else { - inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp_embd; - ggml_set_input(inp_embd); + inp.embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = inp.embd; + ggml_set_input(inp.embd); } // For Granite architecture @@ -1605,38 +1597,38 @@ ggml_tensor * llama_context::build_inp_embd( ggml_tensor * 
llama_context::build_inp_pos( ggml_context * ctx0, int32_t n_tokens) { - inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp_pos); + inp.pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp.pos); - return inp_pos; + return inp.pos; } ggml_tensor * llama_context::build_inp_out_ids( ggml_context * ctx0) { const int32_t n_out_ids = n_outputs; - inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); - ggml_set_input(inp_out_ids); + inp.out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); + ggml_set_input(inp.out_ids); - return inp_out_ids; + return inp.out_ids; } ggml_tensor * llama_context::build_inp_mean( ggml_context * ctx0, int32_t n_tokens) { - inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp_mean); + inp.mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp.mean); - return inp_mean; + return inp.mean; } ggml_tensor * llama_context::build_inp_cls( ggml_context * ctx0, int32_t n_tokens) { - inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp_cls); + inp.cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp.cls); - return inp_cls; + return inp.cls; } void llama_context::build_attn_inp( @@ -1648,11 +1640,11 @@ void llama_context::build_attn_inp( GGML_UNUSED(causal); GGML_UNUSED(swa); - inp_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp.kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp_kq_mask, "KQ_mask", -1); - ggml_set_input(inp_kq_mask); + ggml_set_input(inp.kq_mask); - inp_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_kq_mask, GGML_TYPE_F16) : inp_kq_mask; + inp.kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.kq_mask, GGML_TYPE_F16) : inp.kq_mask; } ggml_tensor * llama_context::build_attn( @@ -1673,7 +1665,7 @@ ggml_tensor * llama_context::build_attn( //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - const auto & kq_mask = inp_kq_mask_cnv; + const auto & kq_mask = inp.kq_mask_cnv; const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); @@ -2923,10 +2915,10 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (inp_self_k_shift) { - assert(ggml_backend_buffer_is_host(inp_self_k_shift->buffer)); + if (inp.self_k_shift) { + assert(ggml_backend_buffer_is_host(inp.self_k_shift->buffer)); - int32_t * data = (int32_t *) inp_self_k_shift->data; + int32_t * data = (int32_t *) inp.self_k_shift->data; for (uint32_t i = 0; i < kv_self.size; ++i) { data[i] = kv_self.cells[i].delta; @@ -2939,7 +2931,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context::input_set(ubatch); - if (inp_self_kq_mask || inp_self_kq_mask_swa) { + if (inp.self_kq_mask || inp.self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; @@ -2950,14 +2942,14 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (inp_self_kq_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask->buffer)); - data = (float *) inp_self_kq_mask->data; + if (inp.self_kq_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); + data = (float *) inp.self_kq_mask->data; } - if (inp_self_kq_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask_swa->buffer)); - data_swa = (float *) inp_self_kq_mask_swa->data; + if (inp.self_kq_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask_swa->buffer)); + data_swa = (float *) inp.self_kq_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -3020,9 +3012,9 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // when using kv cache, the mask needs to match the kv cache size const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_self_kq_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); - float * data = (float *) inp_self_kq_mask->data; + float * data = (float *) inp.self_kq_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -3156,20 +3148,16 @@ ggml_cgraph * llama_context_kv_self::graph_init() { inp_pos_bucket = nullptr; inp_kq_mask_cross = nullptr; - inp_self_kq_mask = nullptr; - inp_self_kq_mask_cnv = nullptr; - inp_self_kq_mask_swa = nullptr; - inp_self_kq_mask_swa_cnv = nullptr; - inp_self_k_shift = nullptr; + inp = {}; return llama_context::graph_init(); } ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { - inp_self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); - ggml_set_input(inp_self_k_shift); + inp.self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + ggml_set_input(inp.self_k_shift); - return inp_self_k_shift; + return inp.self_k_shift; } void llama_context_kv_self::build_attn_inp( @@ -3179,26 +3167,26 @@ void llama_context_kv_self::build_attn_inp( bool swa) { const auto n_kv = kv_self.n; - inp_self_kq_mask = causal + inp.self_kq_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_self_kq_mask, "KQ_mask", -1); - ggml_set_input(inp_self_kq_mask); + //cb(inp.self_kq_mask, "KQ_mask", -1); + ggml_set_input(inp.self_kq_mask); - inp_self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_self_kq_mask, GGML_TYPE_F16) : inp_self_kq_mask; + inp.self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.self_kq_mask, GGML_TYPE_F16) : inp.self_kq_mask; if (swa) { const auto & hparams = model.hparams; GGML_ASSERT(hparams.n_swa > 0); - inp_self_kq_mask_swa = causal + inp.self_kq_mask_swa = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_self_kq_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(inp_self_kq_mask_swa); + //cb(inp.self_kq_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp.self_kq_mask_swa); - inp_self_kq_mask_swa_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp_self_kq_mask_swa, GGML_TYPE_F16) : inp_self_kq_mask_swa; + inp.self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.self_kq_mask_swa, GGML_TYPE_F16) : inp.self_kq_mask_swa; } } @@ -3277,7 +3265,7 @@ ggml_tensor * llama_context_kv_self::build_attn( } }; - const auto & kq_mask = is_sliding ? inp_self_kq_mask_swa_cnv : inp_self_kq_mask_cnv; + const auto & kq_mask = is_sliding ? inp.self_kq_mask_swa_cnv : inp.self_kq_mask_cnv; const auto n_kv = kv_self.n; @@ -4145,9 +4133,9 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { const int64_t n_kv = kv_self.n; - if (inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); - float * data = (float *) inp_s_mask->data; + if (inp.s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_mask->buffer)); + float * data = (float *) inp.s_mask->data; // clear unused states for (int i = 0; i < n_kv; ++i) { @@ -4164,9 +4152,9 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { } } - if (inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); - int32_t * data = (int32_t *) inp_s_copy->data; + if (inp.s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_copy->buffer)); + int32_t * data = (int32_t *) inp.s_copy->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { @@ -4190,8 +4178,8 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { } ggml_cgraph * llama_context_recurrent::graph_init() { - inp_s_copy = nullptr; - inp_s_mask = nullptr; + inp.s_copy = nullptr; + inp.s_mask = nullptr; return llama_context::graph_init(); } @@ -4200,22 +4188,22 @@ ggml_tensor * llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0) { const auto n_kv = kv_self.n; - inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - //cb(inp_s_copy, "inp_s_copy", -1); - ggml_set_input(inp_s_copy); + inp.s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp.s_copy, "inp_s_copy", -1); + ggml_set_input(inp.s_copy); - return inp_s_copy; + return inp.s_copy; } ggml_tensor * llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0) { const auto n_kv = kv_self.n; - inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - //cb(inp_s_mask, "inp_s_mask", -1); - ggml_set_input(inp_s_mask); + inp.s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp.s_mask, "inp_s_mask", -1); + ggml_set_input(inp.s_mask); - return inp_s_mask; + return inp.s_mask; } ggml_tensor * llama_context_recurrent::build_copy_mask_state( diff --git a/src/llama-context.h b/src/llama-context.h index bc6a0e291edbd..ccb84874f8b62 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -139,17 +139,19 @@ struct llama_context : public llama_graph_i { virtual void input_set(const llama_ubatch & ubatch); - // base input tensors - ggml_tensor * inp_tokens; // I32 [n_batch] - ggml_tensor * inp_embd; // F32 [n_embd, n_batch] - ggml_tensor * inp_pos; // I32 [n_batch] - ggml_tensor * inp_out_ids; // I32 [n_outputs] - ggml_tensor * inp_mean; // F32 [n_batch, n_batch] - ggml_tensor * inp_cls; // I32 [n_batch] + struct { + // base input tensors + ggml_tensor * tokens; // I32 [n_batch] + ggml_tensor * embd; // F32 [n_embd, n_batch] + ggml_tensor * pos; // I32 [n_batch] + ggml_tensor * out_ids; // I32 [n_outputs] + ggml_tensor * mean; // F32 [n_batch, n_batch] + ggml_tensor * cls; // I32 [n_batch] - // KQ mask input tensors - ggml_tensor * inp_kq_mask; // F32 [n_tokens, 
n_batch] - ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch] + // KQ mask input tensors + ggml_tensor * kq_mask; // F32 [n_tokens, n_batch] + ggml_tensor * kq_mask_cnv; // [n_tokens, n_batch] + } inp; // // output @@ -409,11 +411,13 @@ class llama_context_kv_self : public llama_context { virtual void input_set(const llama_ubatch & ubatch) override; - ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch] - ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch] - ggml_tensor * inp_self_k_shift; // I32 [kv_size] + struct { + ggml_tensor * self_kq_mask; // F32 [kv_size, n_batch] + ggml_tensor * self_kq_mask_cnv; // [kv_size, n_batch] + ggml_tensor * self_kq_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * self_kq_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * self_k_shift; // I32 [kv_size] + } inp; // // graph @@ -519,8 +523,10 @@ class llama_context_recurrent : public llama_context { virtual void input_set(const llama_ubatch & ubatch) override; - struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + struct { + ggml_tensor * s_copy; // I32 [kv_size] + ggml_tensor * s_mask; // F32 [1, n_kv] + } inp; // // graph From 3753b30d658c93c62f1481d4ed0b2d0800f0d284 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 15:50:27 +0200 Subject: [PATCH 66/84] context : fix n_outputs init ggml-ci --- src/llama-context.cpp | 8 +++----- src/llama-context.h | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 40d4e47a448bc..ce68d410a3795 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1274,14 +1274,13 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { logits = has_logits ? output_base : nullptr; embd = has_embd ? output_base + logits_size : nullptr; - output_size = n_outputs_max; - // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); ggml_backend_buffer_clear(buf_output.get(), 0); - n_outputs = 0; + this->n_outputs = 0; + this->n_outputs_max = n_outputs_max; return n_outputs_max; } @@ -2131,7 +2130,7 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { std::vector w_output_pos; - GGML_ASSERT(n_outputs <= output_size); + GGML_ASSERT(n_outputs <= n_outputs_max); w_output_pos.resize(n_outputs); @@ -2682,7 +2681,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { /* logits_all */ logits_all); // reserve output buffer - // TODO: move to batch manager? 
if (output_reserve(n_outputs_all) < n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; diff --git a/src/llama-context.h b/src/llama-context.h index ccb84874f8b62..f8f01e1bdfe25 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -375,8 +375,8 @@ struct llama_context : public llama_graph_i { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - int32_t output_size = 0; // capacity (of tokens positions) for the output buffers - int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch + int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch + int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers std::vector output_ids; // map batch token positions to ids of the logits and embd buffers From f5e80208c51ea9ec7c3aa0baac0c029278c86c7c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 21 Feb 2025 19:17:47 +0200 Subject: [PATCH 67/84] wip enc-dec --- src/llama-context.cpp | 32 ++++++++++++++++++++++++++------ src/llama-context.h | 26 +++++++++++++++++++++++--- src/llama-graph.cpp | 2 ++ src/llama-graph.h | 15 +++++++++++++++ src/llama-model.h | 2 -- src/llama.cpp | 9 ++++++--- 6 files changed, 72 insertions(+), 14 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ce68d410a3795..9b341aa1824e6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -16,8 +16,10 @@ llama_context::llama_context( const llama_model & model, - const llama_context_params & params) : - model (model) { + const llama_context_params & params, + llama_graph_type gtype) : + llama_graph_i(gtype), + model(model) { LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); t_start_us = model.t_start_us; @@ -2279,8 +2281,9 @@ size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_ llama_context_kv_self::llama_context_kv_self( const llama_model & model, - const llama_context_params & params) : - llama_context(model, params), + const llama_context_params & params, + llama_graph_type gtype) : + llama_context(model, params, gtype), kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_kv_self\n", __func__); @@ -3750,8 +3753,9 @@ size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq llama_context_recurrent::llama_context_recurrent( const llama_model & model, - const llama_context_params & params) : - llama_context(model, params), + const llama_context_params & params, + llama_graph_type gtype) : + llama_context(model, params, gtype), kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); @@ -4619,6 +4623,22 @@ size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_s return io.n_bytes(); } +// +// llama_context_enc_dec +// + +llama_context_enc_dec::llama_context_enc_dec( + const llama_model & model, + const llama_context_params & params) : + llama_context(model, params, LLAMA_GRAPH_TYPE_ENCODER), + ctx_dec(model, params, LLAMA_GRAPH_TYPE_DECODER) { + LLAMA_LOG_INFO("%s: constructing llama_context_enc_dec\n", __func__); +} + +llama_context_enc_dec::~llama_context_enc_dec() { + LLAMA_LOG_INFO("%s: destructing llama_context_enc_dec\n", __func__); +} + // // interface implementation // diff --git a/src/llama-context.h b/src/llama-context.h index f8f01e1bdfe25..7cc982e10bef0 100644 --- a/src/llama-context.h +++ 
b/src/llama-context.h @@ -25,7 +25,8 @@ struct llama_context : public llama_graph_i { public: llama_context( const llama_model & model, - const llama_context_params & params); + const llama_context_params & params, + llama_graph_type gtype); virtual ~llama_context(); @@ -388,7 +389,8 @@ class llama_context_kv_self : public llama_context { public: llama_context_kv_self( const llama_model & model, - const llama_context_params & params); + const llama_context_params & params, + llama_graph_type gtype); virtual ~llama_context_kv_self(); @@ -500,7 +502,8 @@ class llama_context_recurrent : public llama_context { public: llama_context_recurrent( const llama_model & model, - const llama_context_params & params); + const llama_context_params & params, + llama_graph_type gtype); virtual ~llama_context_recurrent(); @@ -604,6 +607,23 @@ class llama_context_recurrent : public llama_context { llama_kv_cache_recurrent kv_self; }; +class llama_context_enc : public llama_context { +public: + using llama_context::llama_context; +}; + +class llama_context_enc_dec : public llama_context { +public: + llama_context_enc_dec( + const llama_model & model, + const llama_context_params & params); + + virtual ~llama_context_enc_dec(); + +protected: + llama_context_kv_self ctx_dec; +}; + // For internal test use // TODO: remove const std::vector> & llama_internal_get_tensor_map(struct llama_context * ctx); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index af556f5bb81f0..af2c94be7f85a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2,6 +2,8 @@ #include "llama-impl.h" +llama_graph_i::llama_graph_i(llama_graph_type type) : type(type) {} + ggml_tensor * llama_graph_i::build_attn( ggml_context * ctx0, ggml_cgraph * gf, diff --git a/src/llama-graph.h b/src/llama-graph.h index 05349e5872710..82d2dc736257a 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -11,6 +11,12 @@ struct ggml_tensor; struct ggml_backend_buffer; struct llama_ubatch; +enum llama_graph_type { + LLAMA_GRAPH_TYPE_DEFAULT, + LLAMA_GRAPH_TYPE_ENCODER, + LLAMA_GRAPH_TYPE_DECODER, +}; + struct llama_graph_result { // important graph nodes ggml_tensor * t_logits = nullptr; @@ -20,6 +26,15 @@ struct llama_graph_result { // TODO: can become more granular in the future class llama_graph_i { +public: + llama_graph_i(llama_graph_type type); + virtual ~llama_graph_i() = default; + + llama_graph_type get_type() const { return type; } + +protected: + llama_graph_type type; + public: // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
virtual void build_cb( diff --git a/src/llama-model.h b/src/llama-model.h index b2d75e593f2f3..447fc0d0576d6 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -5,8 +5,6 @@ #include "llama-hparams.h" #include "llama-vocab.h" -#include "ggml-cpp.h" - #include #include #include diff --git a/src/llama.cpp b/src/llama.cpp index 9bacc9e9b4bea..4ce0c92c4df35 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -331,17 +331,20 @@ struct llama_context * llama_init_from_model( case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: - ctx = new llama_context(*model, params); + ctx = new llama_context_enc(*model, params, LLAMA_GRAPH_TYPE_DEFAULT); + break; + case LLM_ARCH_T5: + ctx = new llama_context_enc_dec(*model, params); break; case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6QWEN2: case LLM_ARCH_MAMBA: GGML_ASSERT(llama_model_is_recurrent(model)); - ctx = new llama_context_recurrent(*model, params); + ctx = new llama_context_recurrent(*model, params, LLAMA_GRAPH_TYPE_DEFAULT); break; default: GGML_ASSERT(!llama_model_is_recurrent(model)); - ctx = new llama_context_kv_self(*model, params); + ctx = new llama_context_kv_self(*model, params, LLAMA_GRAPH_TYPE_DEFAULT); }; ctx->init(); From 372fa3a894757cdd844a27141c6396718fce4f4c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Feb 2025 11:38:59 +0200 Subject: [PATCH 68/84] cont : enc should work now, next is dec ggml-ci --- src/llama-context.cpp | 188 +++++++++++++++++++---------- src/llama-context.h | 41 ++++--- src/llama-graph.cpp | 2 + src/llama-graph.h | 5 + src/llama-model.cpp | 274 +++++++++++++++++++++--------------------- 5 files changed, 293 insertions(+), 217 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9b341aa1824e6..d98f4662c2463 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -10,21 +10,64 @@ #include #include +// +// helpers +// + +static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { + // TODO move to hparams if a T5 variant appears that uses a different value + const int64_t max_distance = 128; + + if (bidirectional) { + n_buckets >>= 1; + } + + const int64_t max_exact = n_buckets >> 1; + + int32_t relative_position = x - y; + int32_t relative_bucket = 0; + + if (bidirectional) { + relative_bucket += (relative_position > 0) * n_buckets; + relative_position = abs(relative_position); + } else { + relative_position = -std::min(relative_position, 0); + } + + int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); + relative_position_if_large = std::min(relative_position_if_large, n_buckets - 1); + relative_bucket += (relative_position < max_exact ? 
relative_position : relative_position_if_large); + + return relative_bucket; +} + // // llama_context // llama_context::llama_context( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype) : llama_graph_i(gtype), model(model) { - LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); + LLAMA_LOG_INFO("%s: constructing llama_context, gtype = %d\n", __func__, gtype); t_start_us = model.t_start_us; t_load_us = model.t_load_us; + switch (gtype) { + case LLAMA_GRAPH_TYPE_DEFAULT: + case LLAMA_GRAPH_TYPE_DECODER: + { + } break; + case LLAMA_GRAPH_TYPE_ENCODER: + { + params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; + params.embeddings = true; + } break; + } + const auto & hparams = model.hparams; cparams.n_seq_max = std::max(1u, params.n_seq_max); @@ -45,20 +88,6 @@ llama_context::llama_context( cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; - } - - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : hparams.n_ctx_train; @@ -95,6 +124,20 @@ llama_context::llama_context( cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; } + // with causal attention, the batch size is limited by the context size + cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? 
params.n_batch : params.n_ubatch); + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); @@ -102,6 +145,7 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn); LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); @@ -1207,6 +1251,23 @@ void llama_context::input_set(const llama_ubatch & ubatch) { } } + if (inp.pos_bucket) { + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp.pos_bucket->buffer)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing + + int32_t * data = (int32_t *) inp.pos_bucket->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_tokens; ++i) { + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, true); + } + } + } + } + GGML_ASSERT( // (!a || b) is a logical implication (a -> b) // !hparams.causal_attn -> !cparams.causal_attn @@ -1604,6 +1665,15 @@ ggml_tensor * llama_context::build_inp_pos( return inp.pos; } +ggml_tensor * llama_context::build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) { + inp.pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + ggml_set_input(inp.pos_bucket); + + return inp.pos_bucket; +} + ggml_tensor * llama_context::build_inp_out_ids( ggml_context * ctx0) { const int32_t n_out_ids = n_outputs; @@ -1656,6 +1726,7 @@ ggml_tensor * llama_context::build_attn( ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il) { @@ -1690,6 +1761,8 @@ ggml_tensor * llama_context::build_attn( GGML_UNUSED(model); GGML_UNUSED(n_ctx); + GGML_ASSERT(kq_b == nullptr); + struct ggml_tensor * v = ggml_cont(ctx0, ggml_permute(ctx0, v_cur, 0, 2, 1, 3)); v = ggml_reshape_3d(ctx0, v, n_embd_head_v, n_kv, n_head_kv); @@ -1720,10 +1793,14 @@ ggml_tensor * llama_context::build_attn( if (hparams.attn_soft_cap) { kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx0, kq); + kq = ggml_tanh (ctx0, kq); kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); } + if (kq_b) { + kq = ggml_add(ctx0, kq, kq_b); + } + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); //cb(kq, "kq_soft_max_ext", il); @@ -2281,7 +2358,7 @@ size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_ llama_context_kv_self::llama_context_kv_self( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype) : llama_context(model, params, gtype), kv_self(model.hparams) { @@ -3053,53 +3130,19 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } - if (inp_pos_bucket) { + if (inp.self_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead 
of failing - static const auto relative_position_bucket = [](llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { - // TODO move to hparams if a T5 variant appears that uses a different value - const int64_t max_distance = 128; - - if (bidirectional) { - n_buckets >>= 1; - } + int32_t * data = (int32_t *) inp.self_pos_bucket->data; - const int64_t max_exact = n_buckets >> 1; - - int32_t relative_position = x - y; - int32_t relative_bucket = 0; - if (bidirectional) { - relative_bucket += (relative_position > 0) * n_buckets; - relative_position = abs(relative_position); - } else { - relative_position = -std::min(relative_position, 0); - } - int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact)); - relative_position_if_large = std::min(relative_position_if_large, n_buckets - 1); - relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large); - return relative_bucket; - }; - - int32_t * data = (int32_t *) inp_pos_bucket->data; - - if (!is_encoding) { - const int64_t n_kv = kv_self.n; - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); - } - } - } - } else { - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); - } + const int64_t n_kv = kv_self.n; + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_kv; ++i) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); } } } @@ -3146,7 +3189,6 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { ggml_cgraph * llama_context_kv_self::graph_init() { inp_embd_enc = nullptr; - inp_pos_bucket = nullptr; inp_kq_mask_cross = nullptr; inp = {}; @@ -3161,6 +3203,17 @@ ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) return inp.self_k_shift; } +ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) { + const auto n_kv = kv_self.n; + + inp.self_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + ggml_set_input(inp.self_pos_bucket); + + return inp.self_pos_bucket; +} + void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, @@ -3199,6 +3252,7 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il) { @@ -3293,6 +3347,8 @@ ggml_tensor * llama_context_kv_self::build_attn( GGML_UNUSED(model); GGML_UNUSED(n_ctx); + GGML_ASSERT(kq_b == nullptr); + // split cached v into n_head heads (not transposed) struct ggml_tensor * v = ggml_view_3d(ctx0, kv_self.v_l[il], @@ -3329,10 +3385,14 @@ ggml_tensor * llama_context_kv_self::build_attn( if (hparams.attn_soft_cap) { kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx0, kq); + kq = ggml_tanh (ctx0, kq); kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); } + if (kq_b) { + kq = ggml_add(ctx0, kq, kq_b); + } + kq = ggml_soft_max_ext(ctx0, kq, 
kq_mask, kq_scale, hparams.f_max_alibi_bias); //cb(kq, "kq_soft_max_ext", il); @@ -3753,7 +3813,7 @@ size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq llama_context_recurrent::llama_context_recurrent( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype) : llama_context(model, params, gtype), kv_self(model.hparams) { @@ -4629,7 +4689,7 @@ size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_s llama_context_enc_dec::llama_context_enc_dec( const llama_model & model, - const llama_context_params & params) : + llama_context_params params) : llama_context(model, params, LLAMA_GRAPH_TYPE_ENCODER), ctx_dec(model, params, LLAMA_GRAPH_TYPE_DECODER) { LLAMA_LOG_INFO("%s: constructing llama_context_enc_dec\n", __func__); diff --git a/src/llama-context.h b/src/llama-context.h index 7cc982e10bef0..3e9baabfb5e67 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -25,7 +25,7 @@ struct llama_context : public llama_graph_i { public: llama_context( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype); virtual ~llama_context(); @@ -142,12 +142,13 @@ struct llama_context : public llama_graph_i { struct { // base input tensors - ggml_tensor * tokens; // I32 [n_batch] - ggml_tensor * embd; // F32 [n_embd, n_batch] - ggml_tensor * pos; // I32 [n_batch] - ggml_tensor * out_ids; // I32 [n_outputs] - ggml_tensor * mean; // F32 [n_batch, n_batch] - ggml_tensor * cls; // I32 [n_batch] + ggml_tensor * tokens; // I32 [n_batch] + ggml_tensor * embd; // F32 [n_embd, n_batch] + ggml_tensor * pos; // I32 [n_batch] + ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] + ggml_tensor * out_ids; // I32 [n_outputs] + ggml_tensor * mean; // F32 [n_batch, n_batch] + ggml_tensor * cls; // I32 [n_batch] // KQ mask input tensors ggml_tensor * kq_mask; // F32 [n_tokens, n_batch] @@ -233,6 +234,10 @@ struct llama_context : public llama_graph_i { ggml_context * ctx0, int32_t n_tokens); + virtual ggml_tensor * build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens); + virtual ggml_tensor * build_inp_out_ids( ggml_context * ctx0); @@ -258,6 +263,7 @@ struct llama_context : public llama_graph_i { ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il); @@ -389,7 +395,7 @@ class llama_context_kv_self : public llama_context { public: llama_context_kv_self( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype); virtual ~llama_context_kv_self(); @@ -414,10 +420,11 @@ class llama_context_kv_self : public llama_context { virtual void input_set(const llama_ubatch & ubatch) override; struct { - ggml_tensor * self_kq_mask; // F32 [kv_size, n_batch] - ggml_tensor * self_kq_mask_cnv; // [kv_size, n_batch] - ggml_tensor * self_kq_mask_swa; // F32 [kv_size, n_batch] - ggml_tensor * self_kq_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] + ggml_tensor * self_kq_mask; // F32 [n_kv, n_batch] + ggml_tensor * self_kq_mask_cnv; // [n_kv, n_batch] + ggml_tensor * self_kq_mask_swa; // F32 [n_kv, n_batch] + ggml_tensor * self_kq_mask_swa_cnv; // [n_kv, n_batch] ggml_tensor * self_k_shift; // I32 [kv_size] } inp; @@ -433,6 +440,10 @@ class llama_context_kv_self : public llama_context { virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; + virtual 
ggml_tensor * build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) override; + virtual void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, @@ -447,6 +458,7 @@ class llama_context_kv_self : public llama_context { ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il) override; @@ -470,7 +482,6 @@ class llama_context_kv_self : public llama_context { std::vector> seq_ids_enc; struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] virtual ggml_tensor * build_inp_embd_enc( @@ -502,7 +513,7 @@ class llama_context_recurrent : public llama_context { public: llama_context_recurrent( const llama_model & model, - const llama_context_params & params, + llama_context_params params, llama_graph_type gtype); virtual ~llama_context_recurrent(); @@ -616,7 +627,7 @@ class llama_context_enc_dec : public llama_context { public: llama_context_enc_dec( const llama_model & model, - const llama_context_params & params); + llama_context_params params); virtual ~llama_context_enc_dec(); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index af2c94be7f85a..3ac96908d69e5 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -12,6 +12,7 @@ ggml_tensor * llama_graph_i::build_attn( ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il) { @@ -22,6 +23,7 @@ ggml_tensor * llama_graph_i::build_attn( GGML_UNUSED(q_cur); GGML_UNUSED(k_cur); GGML_UNUSED(v_cur); + GGML_UNUSED(kq_b); GGML_UNUSED(n_tokens); GGML_UNUSED(kq_scale); GGML_UNUSED(il); diff --git a/src/llama-graph.h b/src/llama-graph.h index 82d2dc736257a..5df90e76d5e3d 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -83,6 +83,10 @@ class llama_graph_i { ggml_context * ctx0, int32_t n_tokens) = 0; + virtual ggml_tensor * build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) = 0; + virtual ggml_tensor * build_inp_out_ids( ggml_context * ctx0) = 0; @@ -108,6 +112,7 @@ class llama_graph_i { ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, + ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, int il); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c862502d3cbac..1e34ed80388bb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1432,7 +1432,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // skip unused tensors if (info.op == GGML_OP_NONE) { - LLAMA_LOG_WARN("model has unused tensor %s -- ignoring\n", tn.str().c_str()); + const size_t nbytes = ggml_nbytes(t_meta); + LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes); + + ml.size_data -= nbytes; ml.n_created++; return nullptr; @@ -3952,6 +3955,14 @@ struct llm_build_context { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } + // TODO: tmp + struct ggml_tensor * build_pos_bucket() { + ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens); + cb(cur, "pos_bucket", -1); + + return cur; + } + // TODO: tmp struct ggml_tensor * build_inp_embd_enc() { ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0); @@ -4263,7 +4274,30 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(ctx0, 
gf, wo, wo_b, q_cur, k_cur, v_cur, nullptr, n_tokens, kq_scale, il); + cb(cur, "kqv_out", il); + + return cur; + } + + struct ggml_tensor * build_attn_with_kq_b( + struct ggml_cgraph * gf, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * kq_b, + int32_t n_tokens, + float kq_scale, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, kq_b, n_tokens, kq_scale, il); cb(cur, "kqv_out", il); return cur; @@ -4364,37 +4398,24 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); } - //struct ggml_tensor * build_pos_bucket(bool causal) { - // if (causal) { - // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - // } else { - // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - // } + struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + struct ggml_tensor * pos_bucket_1d = ggml_reshape_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1]); + cb(pos_bucket_1d, "pos_bucket_1d", -1); - // ggml_set_input(lctx.inp_pos_bucket); - // cb(lctx.inp_pos_bucket, "pos_bucket", -1); + struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + cb(pos_bias, "pos_bias", -1); - // return lctx.inp_pos_bucket; - //} + pos_bias = ggml_reshape_3d(ctx0, pos_bias, pos_bias->ne[0], pos_bucket->ne[0], pos_bucket->ne[1]); + cb(pos_bias, "pos_bias", -1); - //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - // cb(pos_bucket_1d, "pos_bucket_1d", -1); + pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + cb(pos_bias, "pos_bias", -1); - // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - // cb(pos_bias, "pos_bias", -1); + pos_bias = ggml_cont(ctx0, pos_bias); + cb(pos_bias, "pos_bias", -1); - // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - // cb(pos_bias, "pos_bias", -1); - - // pos_bias = ggml_cont(ctx0, pos_bias); - // cb(pos_bias, "pos_bias", -1); - - // return pos_bias; - //} + return pos_bias; + } void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -9614,132 +9635,104 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); } - //void build_t5_enc(ggml_cgraph * gf) { - // const int64_t n_embd_head = hparams.n_embd_head_v; - // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - - // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - - // inpL = build_inp_embd(model.tok_embd); - - // GGML_ASSERT(lctx.is_encoding); - // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - - // // KQ_mask (mask for 1 head, it will be broadcasted to all 
heads) - // struct ggml_tensor * KQ_mask_enc = build_inp_kq_mask(false); - - // for (int il = 0; il < n_layer; ++il) { - // struct ggml_tensor * inpSA = inpL; - - // // norm - // cur = build_norm(inpL, - // model.layers[il].attn_norm_enc, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm", il); + void build_t5_enc(ggml_cgraph * gf) { + const int64_t n_embd_head = hparams.n_embd_head_v; - // // self-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); - // cb(Qcur, "Qcur", il); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); - // cb(Kcur, "Kcur", il); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); - // cb(Vcur, "Vcur", il); + inpL = build_inp_embd(model.tok_embd); - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + lgf->build_attn_inp(ctx0, n_tokens, false, false); - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; - // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - // cb(kq_b, "kq_b", il); + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm_enc, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); - // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + cb(Qcur, "Qcur", il); - // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - // cb(v, "v", il); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + cb(Kcur, "Kcur", il); - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - // cb(kqv, "kqv", il); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + cb(Vcur, "Vcur", il); - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b); - // ggml_build_forward_expand(gf, cur); + cur = build_attn_with_kq_b(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); + cb(cur, "kqv_out", il); + } - // cur = build_lora_mm(model.layers[il].wo_enc, cur); - // cb(cur, "kqv_out", il); - // } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } - // if (il == n_layer - 1) { - // // skip computing output for unused tokens - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - // } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - // cb(ffn_inp, "ffn_inp", il); + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm_enc, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - // // feed-forward network - // { - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm_enc, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); + // T5 uses relu, flan-T5 uses gelu-gated + cur = build_ffn(cur, + model.layers[il].ffn_up_enc, NULL, NULL, + model.layers[il].ffn_gate_enc, NULL, NULL, + model.layers[il].ffn_down_enc, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + il); + cb(cur, "ffn_out", il); + } - // // T5 uses relu, flan-T5 uses gelu-gated - // cur = build_ffn(cur, - // model.layers[il].ffn_up_enc, NULL, NULL, - // model.layers[il].ffn_gate_enc, NULL, NULL, - // model.layers[il].ffn_down_enc, NULL, NULL, - // NULL, - // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - // il); - // cb(cur, "ffn_out", il); - // } + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); - // cur = ggml_add(ctx0, cur, ffn_inp); - // cb(cur, "ffn_out", il); + cur = lgf->build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); - // ggml_tensor * layer_dir = cvec.tensor_for(il); - // if (layer_dir != nullptr) { - // cur = ggml_add(ctx0, cur, layer_dir); - // } - // cb(cur, "l_out", il); + // input for next layer + inpL = cur; + } - // // input for next layer - // inpL = cur; - // } + cur = inpL; + cb(cur, "result_embd", -1); - // cur = inpL; - // cb(cur, "result_embd", -1); + cur = build_norm(cur, + model.output_norm_enc, NULL, + LLM_NORM_RMS, -1); - // cur = build_norm(cur, - // model.output_norm_enc, NULL, - // LLM_NORM_RMS, -1); - // - // cb(cur, "result_norm", -1); - // res.t_embd = cur; + cb(cur, "result_norm", -1); + res.t_embd = cur; - // ggml_build_forward_expand(gf, cur); - //} + ggml_build_forward_expand(gf, cur); + } //void build_t5_dec(ggml_cgraph * gf) { // const int64_t n_embd_head = hparams.n_embd_head_v; @@ -11091,14 +11084,19 @@ llama_graph_result llama_model::build_graph( { llm.build_bitnet(gf); } break; - //case LLM_ARCH_T5: - // { - // if (lctx.is_encoding) { - // llm.build_t5_enc(gf); - // } else { - // llm.build_t5_dec(gf); - // } - // } break; + case LLM_ARCH_T5: + { + switch (lgf->get_type()) { + case LLAMA_GRAPH_TYPE_ENCODER: + llm.build_t5_enc(gf); + break; + case LLAMA_GRAPH_TYPE_DECODER: + //llm.build_t5_dec(gf); + break; + default: + GGML_ABORT("invalid graph type"); + }; + } break; //case LLM_ARCH_T5ENCODER: // { // llm.build_t5_enc(gf); From 6378112cb5c91125f32bcf35e7f556ee6be40fb9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Feb 2025 19:39:22 +0200 Subject: [PATCH 69/84] graph : remove the build_kv_... 
API from llama_graph_i ggml-ci --- src/llama-context.cpp | 19 +++++++++++++++++ src/llama-context.h | 47 ++++++++++++++++++++++++++++--------------- src/llama-graph.cpp | 18 ----------------- src/llama-graph.h | 9 --------- 4 files changed, 50 insertions(+), 43 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d98f4662c2463..5ad1e2a61edbb 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1842,6 +1842,25 @@ ggml_tensor * llama_context::build_attn( return cur; } +void llama_context::build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); +} + +void llama_context::build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); +} + + // // perf // diff --git a/src/llama-context.h b/src/llama-context.h index 3e9baabfb5e67..09c8f484251c6 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -171,7 +171,7 @@ struct llama_context : public llama_graph_i { // graph // - // zero-out inputs and create the ctx_context for the compute graph + // zero-out inputs and create the ctx_compute for the compute graph virtual ggml_cgraph * graph_init(); // TODO: add encode/decode graphs @@ -187,73 +187,74 @@ struct llama_context : public llama_graph_i { ggml_context_ptr ctx_compute; +public: // - // graph build API (generic) + // graph build // virtual void build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il); + int il) override; // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il); + int il) override; // do mat_mul, while optionally apply lora virtual ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur); + ggml_tensor * cur) override; // do mat_mul_id, while optionally apply lora virtual ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids); + ggml_tensor * ids) override; - virtual ggml_tensor * build_rope_factors(int il); + virtual ggml_tensor * build_rope_factors(int il) override; virtual ggml_tensor * build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuf); + ggml_backend_buffer * bbuf) override; virtual ggml_tensor * build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, - const llama_ubatch & ubatch); + const llama_ubatch & ubatch) override; virtual ggml_tensor * build_inp_pos( ggml_context * ctx0, - int32_t n_tokens); + int32_t n_tokens) override; virtual ggml_tensor * build_inp_pos_bucket( ggml_context * ctx0, - int32_t n_tokens); + int32_t n_tokens) override; virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0); + ggml_context * ctx0) override; virtual ggml_tensor * build_inp_mean( ggml_context * ctx0, - int32_t n_tokens); + int32_t n_tokens) override; virtual ggml_tensor * build_inp_cls( ggml_context * ctx0, - int32_t n_tokens); + int32_t n_tokens) override; virtual void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa); + bool swa) override; virtual ggml_tensor * build_attn( ggml_context * ctx0, @@ -266,7 +267,17 @@ struct llama_context : public llama_graph_i { ggml_tensor * kq_b, int32_t n_tokens, float kq_scale, - int il); + int il) override; + +protected: + virtual void 
build_kv_self_shift( + ggml_context * ctx0, + ggml_cgraph * gf); + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_kv_self_defrag( + ggml_context * ctx0, + ggml_cgraph * gf); public: // @@ -434,6 +445,7 @@ class llama_context_kv_self : public llama_context { virtual ggml_cgraph * graph_init() override; +public: // // graph build // @@ -463,6 +475,7 @@ class llama_context_kv_self : public llama_context { float kq_scale, int il) override; +protected: virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) override; @@ -548,6 +561,7 @@ class llama_context_recurrent : public llama_context { virtual ggml_cgraph * graph_init() override; +public: // // graph build // @@ -600,6 +614,7 @@ class llama_context_recurrent : public llama_context { const llama_ubatch & ubatch, int il) override; +protected: // // state save/load // diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 3ac96908d69e5..25922260d2a7c 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -32,24 +32,6 @@ ggml_tensor * llama_graph_i::build_attn( return nullptr; } -void llama_graph_i::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf) { - GGML_UNUSED(ctx0); - GGML_UNUSED(gf); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); -} - -void llama_graph_i::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf) { - GGML_UNUSED(ctx0); - GGML_UNUSED(gf); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); -} - ggml_tensor * llama_graph_i::build_inp_self_k_shift( ggml_context * ctx0) { GGML_UNUSED(ctx0); diff --git a/src/llama-graph.h b/src/llama-graph.h index 5df90e76d5e3d..3433caf63ac89 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -117,15 +117,6 @@ class llama_graph_i { float kq_scale, int il); - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf); - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf); - virtual ggml_tensor * build_inp_self_k_shift( ggml_context * ctx0); From 0699a44c83b5349e13c0e4abe0b3ab09e1d6462c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Feb 2025 20:02:11 +0200 Subject: [PATCH 70/84] context : remove redundant virtual, protected -> private ggml-ci --- src/llama-context.cpp | 8 +++ src/llama-context.h | 126 ++++++++++++++++++++++++------------------ src/llama-graph.cpp | 8 --- src/llama-graph.h | 3 - 4 files changed, 79 insertions(+), 66 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5ad1e2a61edbb..7628cbc9bf20c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1842,6 +1842,14 @@ ggml_tensor * llama_context::build_attn( return cur; } +ggml_tensor * llama_context::build_inp_self_k_shift( + ggml_context * ctx0) { + GGML_UNUSED(ctx0); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + void llama_context::build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) { diff --git a/src/llama-context.h b/src/llama-context.h index 09c8f484251c6..0e55aae1c8dfb 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -140,6 +140,7 @@ struct llama_context : public llama_graph_i { virtual void input_set(const llama_ubatch & ubatch); +private: struct { // base input tensors ggml_tensor * tokens; // I32 [n_batch] @@ -155,6 +156,7 @@ struct llama_context : public llama_graph_i { ggml_tensor * kq_mask_cnv; // [n_tokens, 
n_batch] } inp; +protected: // // output // @@ -192,71 +194,71 @@ struct llama_context : public llama_graph_i { // graph build // - virtual void build_cb( + void build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, int il) override; // apply control vector for layer il - virtual ggml_tensor * build_cvec( + ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, int il) override; // do mat_mul, while optionally apply lora - virtual ggml_tensor * build_lora_mm( + ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur) override; // do mat_mul_id, while optionally apply lora - virtual ggml_tensor * build_lora_mm_id( + ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * ids) override; - virtual ggml_tensor * build_rope_factors(int il) override; + ggml_tensor * build_rope_factors(int il) override; - virtual ggml_tensor * build_rope_shift( + ggml_tensor * build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, ggml_backend_buffer * bbuf) override; - virtual ggml_tensor * build_inp_embd( + ggml_tensor * build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) override; - virtual ggml_tensor * build_inp_pos( + ggml_tensor * build_inp_pos( ggml_context * ctx0, int32_t n_tokens) override; - virtual ggml_tensor * build_inp_pos_bucket( + ggml_tensor * build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) override; - virtual ggml_tensor * build_inp_out_ids( + ggml_tensor * build_inp_out_ids( ggml_context * ctx0) override; - virtual ggml_tensor * build_inp_mean( + ggml_tensor * build_inp_mean( ggml_context * ctx0, int32_t n_tokens) override; - virtual ggml_tensor * build_inp_cls( + ggml_tensor * build_inp_cls( ggml_context * ctx0, int32_t n_tokens) override; - virtual void build_attn_inp( + void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa) override; - virtual ggml_tensor * build_attn( + ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * wo, @@ -270,6 +272,9 @@ struct llama_context : public llama_graph_i { int il) override; protected: + virtual ggml_tensor * build_inp_self_k_shift( + ggml_context * ctx0); + virtual void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf); @@ -288,6 +293,7 @@ struct llama_context : public llama_graph_i { virtual void perf_reset(); protected: + // TODO: become private mutable int64_t t_start_us = 0; mutable int64_t t_load_us = 0; mutable int64_t t_p_eval_us = 0; @@ -346,6 +352,7 @@ struct llama_context : public llama_graph_i { // // members // + // TODO: become private / move to llama_graph_i const llama_model & model; @@ -412,24 +419,25 @@ class llama_context_kv_self : public llama_context { virtual ~llama_context_kv_self(); protected: - virtual void reserve() override; + void reserve() override; public: - virtual llama_kv_cache * get_kv_self() override; - virtual const llama_kv_cache * get_kv_self() const override; + llama_kv_cache * get_kv_self() override; + const llama_kv_cache * get_kv_self() const override; - virtual void kv_self_update() override; + void kv_self_update() override; - virtual int encode(llama_batch & inp_batch) override; - virtual int decode(llama_batch & inp_batch) override; + int encode(llama_batch & inp_batch) override; + int decode(llama_batch & inp_batch) override; protected: // // input // - virtual void input_set(const 
llama_ubatch & ubatch) override; + void input_set(const llama_ubatch & ubatch) override; +private: struct { ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] ggml_tensor * self_kq_mask; // F32 [n_kv, n_batch] @@ -443,26 +451,24 @@ class llama_context_kv_self : public llama_context { // graph // - virtual ggml_cgraph * graph_init() override; + ggml_cgraph * graph_init() override; public: // // graph build // - virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; - - virtual ggml_tensor * build_inp_pos_bucket( + ggml_tensor * build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) override; - virtual void build_attn_inp( + void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa) override; - virtual ggml_tensor * build_attn( + ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * wo, @@ -476,16 +482,22 @@ class llama_context_kv_self : public llama_context { int il) override; protected: - virtual void build_kv_self_shift( + ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; + + void build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) override; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( + void build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * gf) override; + // ======================================================= // === encoder-decoder === + // + // TODO: this is temporary here, it will be moved + // // whether we are computing encoder output or decoder output bool is_encoding = false; @@ -497,23 +509,25 @@ class llama_context_kv_self : public llama_context { struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] - virtual ggml_tensor * build_inp_embd_enc( + ggml_tensor * build_inp_embd_enc( ggml_context * ctx0) override; - virtual ggml_tensor * build_inp_kq_mask_cross( + ggml_tensor * build_inp_kq_mask_cross( ggml_context * ctx0, int32_t n_tokens) override; + // ====================================================== // // state save/load // - virtual size_t state_get_data(llama_io_write_i & io) override; - virtual size_t state_set_data(llama_io_read_i & io) override; + size_t state_get_data(llama_io_write_i & io) override; + size_t state_set_data(llama_io_read_i & io) override; - virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; +private: // // members // @@ -532,24 +546,25 @@ class llama_context_recurrent : public llama_context { virtual ~llama_context_recurrent(); protected: - virtual void reserve() override; + void reserve() override; public: - virtual llama_kv_cache * get_kv_self() override; - virtual const llama_kv_cache * get_kv_self() const override; + llama_kv_cache * get_kv_self() override; + const llama_kv_cache * get_kv_self() const override; - virtual void kv_self_update() override; + void kv_self_update() override; - virtual int encode(llama_batch & inp_batch) override; - virtual int decode(llama_batch & inp_batch) override; + int encode(llama_batch & inp_batch) override; + int decode(llama_batch & inp_batch) override; protected: // // input // - virtual void input_set(const llama_ubatch & ubatch) 
override; + void input_set(const llama_ubatch & ubatch) override; +private: struct { ggml_tensor * s_copy; // I32 [kv_size] ggml_tensor * s_mask; // F32 [1, n_kv] @@ -559,20 +574,20 @@ class llama_context_recurrent : public llama_context { // graph // - virtual ggml_cgraph * graph_init() override; + ggml_cgraph * graph_init() override; public: // // graph build // - virtual ggml_tensor * build_inp_s_copy( + ggml_tensor * build_inp_s_copy( ggml_context * ctx0) override; - virtual ggml_tensor * build_inp_s_mask( + ggml_tensor * build_inp_s_mask( ggml_context * ctx0) override; - virtual ggml_tensor * build_copy_mask_state( + ggml_tensor * build_copy_mask_state( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * s, @@ -581,7 +596,7 @@ class llama_context_recurrent : public llama_context { int32_t n_state, int32_t n_seqs) override; - virtual ggml_tensor * build_mamba_layer( + ggml_tensor * build_mamba_layer( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, @@ -590,7 +605,7 @@ class llama_context_recurrent : public llama_context { const llama_ubatch & ubatch, int il) override; - virtual ggml_tensor * build_rwkv_token_shift_load( + ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * state_copy, @@ -598,13 +613,13 @@ class llama_context_recurrent : public llama_context { const llama_ubatch & ubatch, int il) override; - virtual ggml_tensor * build_rwkv_token_shift_store( + ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, int il) override; - virtual ggml_tensor * build_rwkv6_time_mix( + ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, @@ -619,12 +634,13 @@ class llama_context_recurrent : public llama_context { // state save/load // - virtual size_t state_get_data(llama_io_write_i & io) override; - virtual size_t state_set_data(llama_io_read_i & io) override; + size_t state_get_data(llama_io_write_i & io) override; + size_t state_set_data(llama_io_read_i & io) override; - virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; + size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; +private: // // members // @@ -646,7 +662,7 @@ class llama_context_enc_dec : public llama_context { virtual ~llama_context_enc_dec(); -protected: +private: llama_context_kv_self ctx_dec; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 25922260d2a7c..c058ee2498880 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -32,14 +32,6 @@ ggml_tensor * llama_graph_i::build_attn( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_self_k_shift( - ggml_context * ctx0) { - GGML_UNUSED(ctx0); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); - return nullptr; -} - ggml_tensor * llama_graph_i::build_inp_embd_enc( ggml_context * ctx0) { GGML_UNUSED(ctx0); diff --git a/src/llama-graph.h b/src/llama-graph.h index 3433caf63ac89..ee56f08396a63 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -117,9 +117,6 @@ class llama_graph_i { float kq_scale, int il); - virtual ggml_tensor * build_inp_self_k_shift( - ggml_context * ctx0); - virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0); From a5a85a3bc0c45d4f31f8ef4bc16ef158b0a8d670 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Feb 2025 
08:59:12 +0200 Subject: [PATCH 71/84] context : fix recurrent reserve ggml-ci --- src/llama-context.cpp | 5 +++++ src/llama-context.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7628cbc9bf20c..f73d4b9bf4c2f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3883,6 +3883,11 @@ llama_context_recurrent::llama_context_recurrent( llama_context_recurrent::~llama_context_recurrent() = default; void llama_context_recurrent::reserve() { + // simulate full KV cache + kv_self.n = kv_self.size; + + LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); + // TODO: implement recurrent-specific reserve logic llama_context::reserve(); } diff --git a/src/llama-context.h b/src/llama-context.h index 0e55aae1c8dfb..2945cbabe4559 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -447,6 +447,7 @@ class llama_context_kv_self : public llama_context { ggml_tensor * self_k_shift; // I32 [kv_size] } inp; +protected: // // graph // @@ -570,6 +571,7 @@ class llama_context_recurrent : public llama_context { ggml_tensor * s_mask; // F32 [1, n_kv] } inp; +protected: // // graph // From 4a1054b55259cb3d43929121294e0ac28a632435 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Feb 2025 11:18:40 +0200 Subject: [PATCH 72/84] context : reuse built_attn_mha ggml-ci --- src/llama-context.cpp | 210 +++++++++++++----------------------------- src/llama-context.h | 17 ++-- src/llama-graph.cpp | 6 -- src/llama-graph.h | 3 - src/llama-model.cpp | 36 +++++++- 5 files changed, 107 insertions(+), 165 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f73d4b9bf4c2f..e05afb5646afc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1721,50 +1721,67 @@ void llama_context::build_attn_inp( ggml_tensor * llama_context::build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) { - const auto & hparams = model.hparams; + GGML_UNUSED(il); - const auto & n_ctx = cparams.n_ctx; + const auto & kq_mask = inp.kq_mask_cnv; - //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); - const auto & kq_mask = inp.kq_mask_cnv; + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + //cb(k, "v", il); - //const auto & n_embd_head_k = hparams.n_embd_head_k; - const auto & n_embd_head_v = hparams.n_embd_head_v; + ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale); - // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - const auto n_kv = n_tokens; + return cur; +} - struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - //cb(q, "q", il); +ggml_tensor * llama_context::build_attn_mha( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * kq_b, + ggml_tensor * kq_mask, + bool v_trans, + float kq_scale) { + const auto & hparams = model.hparams; - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, k_cur, 0, 2, 1, 3)); - //cb(k, "k", il); + //const int64_t n_embd_k_gqa = 
hparams.n_embd_k_gqa(il); + //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + //const int64_t n_head = hparams.n_head(il); + //const int64_t n_head_kv = hparams.n_head_kv(il); + + //const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0]; + + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; + const auto n_kv = k->ne[1]; struct ggml_tensor * cur; - //if (cparams.flash_attn) { - if (false) { // TODO: need to pad the batch size to a multiple of GGML_KQ_MASK_PAD + if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) { GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - GGML_ASSERT(kq_b == nullptr); + GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet"); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_permute(ctx0, v_cur, 0, 2, 1, 3)); - v = ggml_reshape_3d(ctx0, v, n_embd_head_v, n_kv, n_head_kv); + if (v_trans) { + v = ggml_transpose(ctx0, v); + } cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); @@ -1774,7 +1791,6 @@ ggml_tensor * llama_context::build_attn( cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); } else { struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - //cb(kq, "kq", il); // note: this op tends to require high floating point range // while for some models F16 is enough, for others it is not, so we default to F32 here @@ -1802,22 +1818,17 @@ ggml_tensor * llama_context::build_attn( } kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - //cb(kq, "kq_soft_max_ext", il); - - // split cached v into n_head heads - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens))); - v = ggml_reshape_3d(ctx0, v, n_kv, n_embd_head_v, n_head_kv); - //cb(v, "v", il); + if (!v_trans) { + // note: avoid this branch + v = ggml_cont(ctx0, ggml_transpose(ctx0, v)); + } struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - //cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - //cb(kqv_merged, "kqv_merged", il); cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - //cb(cur, "kqv_merged_cont", il); if (!cparams.offload_kqv) { // all nodes between the KV store and the attention output are run on the CPU @@ -1827,18 +1838,6 @@ ggml_tensor * llama_context::build_attn( ggml_build_forward_expand(gf, cur); - if (wo) { - cur = build_lora_mm(ctx0, wo, cur); - } - - if (wo_b) { - //cb(cur, "kqv_wo", il); - } - - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } - return cur; } @@ -3274,13 +3273,10 @@ void llama_context_kv_self::build_attn_inp( ggml_tensor * llama_context_kv_self::build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) { const auto & hparams = model.hparams; @@ -3290,6 +3286,10 @@ ggml_tensor * llama_context_kv_self::build_attn( const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const auto n_tokens = q_cur->ne[2]; + + const bool v_trans = !cparams.flash_attn; + // store to KV cache { GGML_ASSERT(!kv_self.recurrent); @@ -3308,7 +3308,7 @@ ggml_tensor * llama_context_kv_self::build_attn( struct ggml_tensor * v_cache_view = nullptr; - if 
(cparams.flash_attn) { + if (!v_trans) { v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); } else { // note: the V cache is transposed when not using flash attention @@ -3351,16 +3351,15 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto n_kv = kv_self.n; - const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); const auto & n_embd_head_k = hparams.n_embd_head_k; const auto & n_embd_head_v = hparams.n_embd_head_v; - struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); - struct ggml_tensor * k = + ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head_k, n_kv, n_head_kv, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), @@ -3368,100 +3367,19 @@ ggml_tensor * llama_context_kv_self::build_attn( 0); //cb(k, "k", il); - struct ggml_tensor * cur; - - if (cparams.flash_attn) { - GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - - GGML_ASSERT(kq_b == nullptr); - - // split cached v into n_head heads (not transposed) - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), - 0); - //cb(v, "v", il); - - cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, - hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); - - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - - cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); - } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - //cb(kq, "kq", il); - - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below - - kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx0, kq, 30); - } - - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh (ctx0, kq); - kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); - } - - if (kq_b) { - kq = ggml_add(ctx0, kq, kq_b); - } - - kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - //cb(kq, "kq_soft_max_ext", il); - - GGML_ASSERT(kv_self.size == n_ctx); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - //cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - //cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - //cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); - //cb(cur, "kqv_merged_cont", il); - - if (!cparams.offload_kqv) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); - } - } - - ggml_build_forward_expand(gf, cur); - - if (wo) { - cur = build_lora_mm(ctx0, wo, cur); - } - - if 
(wo_b) { - //cb(cur, "kqv_wo", il); - } + ggml_tensor * v = !v_trans ? + ggml_view_3d(ctx0, kv_self.v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + 0) : + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } + struct ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale); return cur; } diff --git a/src/llama-context.h b/src/llama-context.h index 2945cbabe4559..5b63b3b06d21c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -261,17 +261,25 @@ struct llama_context : public llama_graph_i { ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) override; protected: + virtual ggml_tensor * build_attn_mha( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * kq_b, + ggml_tensor * kq_mask, + bool v_trans, + float kq_scale); + virtual ggml_tensor * build_inp_self_k_shift( ggml_context * ctx0); @@ -472,13 +480,10 @@ class llama_context_kv_self : public llama_context { ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) override; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index c058ee2498880..99eb326205bc6 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -7,24 +7,18 @@ llama_graph_i::llama_graph_i(llama_graph_type type) : type(type) {} ggml_tensor * llama_graph_i::build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il) { GGML_UNUSED(ctx0); GGML_UNUSED(gf); - GGML_UNUSED(wo); - GGML_UNUSED(wo_b); GGML_UNUSED(q_cur); GGML_UNUSED(k_cur); GGML_UNUSED(v_cur); GGML_UNUSED(kq_b); - GGML_UNUSED(n_tokens); GGML_UNUSED(kq_scale); GGML_UNUSED(il); diff --git a/src/llama-graph.h b/src/llama-graph.h index ee56f08396a63..c84c254934ff1 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -107,13 +107,10 @@ class llama_graph_i { virtual ggml_tensor * build_attn( ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - int32_t n_tokens, float kq_scale, int il); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1e34ed80388bb..e8057f4687fdf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4265,18 +4265,32 @@ struct llm_build_context { struct ggml_tensor * q_cur, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, - int32_t n_tokens, + int32_t n_tokens, // TODO: remove float kq_scale, int il) { + GGML_UNUSED(n_tokens); + // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, nullptr, 
n_tokens, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); cb(cur, "kqv_out", il); + if (wo) { + cur = lgf->build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + return cur; } @@ -4288,18 +4302,32 @@ struct llm_build_context { struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, struct ggml_tensor * kq_b, - int32_t n_tokens, + int32_t n_tokens, // TODO: remove float kq_scale, int il) { + GGML_UNUSED(n_tokens); + // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, kq_b, n_tokens, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(ctx0, gf, q_cur, k_cur, v_cur, kq_b, kq_scale, il); cb(cur, "kqv_out", il); + if (wo) { + cur = lgf->build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + return cur; } From 9cd78f11a103c578cb598b16b4e49fc4709754a2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Feb 2025 13:38:11 +0200 Subject: [PATCH 73/84] context : explicit llama_context_i abstract interface ggml-ci --- src/llama-context.cpp | 202 +++++++++++++++---------------- src/llama-context.h | 268 +++++++++++++++++++++++++++++++----------- 2 files changed, 299 insertions(+), 171 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e05afb5646afc..6b101f4869e44 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -42,16 +42,17 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t } // -// llama_context +// llama_context_base // -llama_context::llama_context( +llama_context_base::llama_context_base( const llama_model & model, llama_context_params params, llama_graph_type gtype) : + llama_context_i(), llama_graph_i(gtype), model(model) { - LLAMA_LOG_INFO("%s: constructing llama_context, gtype = %d\n", __func__, gtype); + LLAMA_LOG_INFO("%s: constructing llama_context_base, gtype = %d\n", __func__, gtype); t_start_us = model.t_start_us; t_load_us = model.t_load_us; @@ -223,9 +224,9 @@ llama_context::llama_context( } } -llama_context::~llama_context() = default; +llama_context_base::~llama_context_base() = default; -void llama_context::init() { +void llama_context_base::init() { LLAMA_LOG_DEBUG("%s: call\n", __func__); const auto & hparams = model.hparams; @@ -306,7 +307,7 @@ void llama_context::init() { reserve(); } -void llama_context::synchronize() { +void llama_context_base::synchronize() { ggml_backend_sched_synchronize(sched.get()); // FIXME: if multiple single tokens are evaluated without a synchronization, @@ -336,7 +337,7 @@ void llama_context::synchronize() { t_compute_start_us = 0; } -void llama_context::reserve() { +void llama_context_base::reserve() { uint32_t n_seqs = 1; // TODO: worst-case number of sequences uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); @@ -415,72 +416,72 @@ void llama_context::reserve() { } } -const llama_model & llama_context::get_model() const { +const llama_model & llama_context_base::get_model() const { return model; } -const llama_cparams & llama_context::get_cparams() const { +const llama_cparams & llama_context_base::get_cparams() const { return cparams; } -uint32_t 
llama_context::n_ctx() const { +uint32_t llama_context_base::n_ctx() const { return cparams.n_ctx; } -uint32_t llama_context::n_ctx_per_seq() const { +uint32_t llama_context_base::n_ctx_per_seq() const { return cparams.n_ctx / cparams.n_seq_max; } -uint32_t llama_context::n_batch() const { +uint32_t llama_context_base::n_batch() const { return cparams.n_batch; } -uint32_t llama_context::n_ubatch() const { +uint32_t llama_context_base::n_ubatch() const { return cparams.n_ubatch; } -uint32_t llama_context::n_seq_max() const { +uint32_t llama_context_base::n_seq_max() const { return cparams.n_seq_max; } -uint32_t llama_context::n_threads() const { +uint32_t llama_context_base::n_threads() const { return cparams.n_threads; } -uint32_t llama_context::n_threads_batch() const { +uint32_t llama_context_base::n_threads_batch() const { return cparams.n_threads_batch; } -int32_t llama_context::max_nodes() const { +int32_t llama_context_base::max_nodes() const { return std::max(8192, 5*model.n_tensors()); } -llama_kv_cache * llama_context::get_kv_self() { - LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); +llama_kv_cache * llama_context_base::get_kv_self() { + LLAMA_LOG_WARN("%s: llama_context_base does not have a KV cache\n", __func__); return nullptr; } -const llama_kv_cache * llama_context::get_kv_self() const { - LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); +const llama_kv_cache * llama_context_base::get_kv_self() const { + LLAMA_LOG_WARN("%s: llama_context_base does not have a KV cache\n", __func__); return nullptr; } -void llama_context::kv_self_update() { - LLAMA_LOG_WARN("%s: llama_context does not have a KV cache\n", __func__); +void llama_context_base::kv_self_update() { + LLAMA_LOG_WARN("%s: llama_context_base does not have a KV cache\n", __func__); } -enum llama_pooling_type llama_context::pooling_type() const { +enum llama_pooling_type llama_context_base::pooling_type() const { return cparams.pooling_type; } -float * llama_context::get_logits() { +float * llama_context_base::get_logits() { // reorder logits for backward compatibility output_reorder(); return logits; } -float * llama_context::get_logits_ith(int32_t i) { +float * llama_context_base::get_logits_ith(int32_t i) { int32_t j = -1; try { @@ -518,14 +519,14 @@ float * llama_context::get_logits_ith(int32_t i) { } } -float * llama_context::get_embeddings() { +float * llama_context_base::get_embeddings() { // reorder embeddings for backward compatibility output_reorder(); return embd; } -float * llama_context::get_embeddings_ith(int32_t i) { +float * llama_context_base::get_embeddings_ith(int32_t i) { int32_t j = -1; try { @@ -563,7 +564,7 @@ float * llama_context::get_embeddings_ith(int32_t i) { } } -float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { +float * llama_context_base::get_embeddings_seq(llama_seq_id seq_id) { auto it = embd_seq.find(seq_id); if (it == embd_seq.end()) { return nullptr; @@ -572,11 +573,11 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } -int64_t llama_context::n_pos_per_token() const { +int64_t llama_context_base::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; } -void llama_context::attach_threadpool( +void llama_context_base::attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) { LLAMA_LOG_DEBUG("%s: call\n", __func__); @@ -585,21 +586,21 @@ void llama_context::attach_threadpool( this->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; } -void llama_context::detach_threadpool() { +void llama_context_base::detach_threadpool() { LLAMA_LOG_DEBUG("%s: call\n", __func__); this->threadpool = nullptr; this->threadpool_batch = nullptr; } -void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { +void llama_context_base::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { LLAMA_LOG_DEBUG("%s: n_threads = %d, n_threads_batch = %d\n", __func__, n_threads, n_threads_batch); cparams.n_threads = n_threads; cparams.n_threads_batch = n_threads_batch; } -void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { +void llama_context_base::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { LLAMA_LOG_DEBUG("%s: call\n", __func__); this->abort_callback = abort_callback; @@ -614,19 +615,19 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void } } -void llama_context::set_embeddings(bool value) { +void llama_context_base::set_embeddings(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); cparams.embeddings = value; } -void llama_context::set_causal_attn(bool value) { +void llama_context_base::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); cparams.causal_attn = value; } -void llama_context::set_adapter_lora( +void llama_context_base::set_adapter_lora( llama_adapter_lora * adapter, float scale) { LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale); @@ -634,7 +635,7 @@ void llama_context::set_adapter_lora( loras[adapter] = scale; } -bool llama_context::rm_adapter_lora( +bool llama_context_base::rm_adapter_lora( llama_adapter_lora * adapter) { LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter); @@ -647,13 +648,13 @@ bool llama_context::rm_adapter_lora( return false; } -void llama_context::clear_adapter_lora() { +void llama_context_base::clear_adapter_lora() { LLAMA_LOG_DEBUG("%s: call\n", __func__); loras.clear(); } -bool llama_context::apply_adapter_cvec( +bool llama_context_base::apply_adapter_cvec( const float * data, size_t len, int32_t n_embd, @@ -664,7 +665,7 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -int llama_context::encode(llama_batch & inp_batch) { +int llama_context_base::encode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -798,7 +799,7 @@ int llama_context::encode(llama_batch & inp_batch) { return 0; } -int llama_context::decode(llama_batch & inp_batch) { +int llama_context_base::decode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -829,7 +830,7 @@ int llama_context::decode(llama_batch & inp_batch) { } // micro-batching is not possible without KV cache - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context requires n_ubatch >= n_tokens"); + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "llama_context_base requires n_ubatch >= n_tokens"); if (t_compute_start_us == 0) { t_compute_start_us = ggml_time_us(); @@ -1006,7 +1007,7 @@ int llama_context::decode(llama_batch & inp_batch) { // input // -void llama_context::input_set(const llama_ubatch & ubatch) { +void llama_context_base::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; if (ubatch.token) { @@ -1280,7 +1281,7 @@ void 
llama_context::input_set(const llama_ubatch & ubatch) { // output // -int32_t llama_context::output_reserve(int32_t n_outputs) { +int32_t llama_context_base::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; @@ -1348,7 +1349,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { return n_outputs_max; } -void llama_context::output_reorder() { +void llama_context_base::output_reorder() { auto & out_ids = sbatch.out_ids; if (!out_ids.empty()) { const uint32_t n_vocab = model.vocab.n_tokens(); @@ -1390,7 +1391,7 @@ void llama_context::output_reorder() { // graph // -ggml_cgraph * llama_context::graph_init() { +ggml_cgraph * llama_context_base::graph_init() { inp = {}; struct ggml_init_params params = { @@ -1404,14 +1405,14 @@ ggml_cgraph * llama_context::graph_init() { return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); } -llama_graph_result llama_context::graph_build( +llama_graph_result llama_context_base::graph_build( ggml_context * ctx, ggml_cgraph * gf, const llama_ubatch & ubatch) { return model.build_graph(ctx, gf, this, cparams, ubatch); } -enum ggml_status llama_context::graph_compute( +enum ggml_status llama_context_base::graph_compute( ggml_cgraph * gf, bool batched) { int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; @@ -1442,7 +1443,7 @@ enum ggml_status llama_context::graph_compute( // graph build API // -void llama_context::build_cb( +void llama_context_base::build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, @@ -1477,14 +1478,14 @@ void llama_context::build_cb( } } -ggml_tensor * llama_context::build_cvec( +ggml_tensor * llama_context_base::build_cvec( ggml_context * ctx0, ggml_tensor * cur, int il) { return cvec.apply_to(ctx0, cur, il); } -ggml_tensor * llama_context::build_lora_mm( +ggml_tensor * llama_context_base::build_lora_mm( ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur) { @@ -1511,7 +1512,7 @@ ggml_tensor * llama_context::build_lora_mm( return res; } -ggml_tensor * llama_context::build_lora_mm_id( +ggml_tensor * llama_context_base::build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur, @@ -1540,7 +1541,7 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } -ggml_tensor * llama_context::build_rope_factors(int il) { +ggml_tensor * llama_context_base::build_rope_factors(int il) { const auto & hparams = model.hparams; // choose long/short freq factors based on the context size @@ -1557,7 +1558,7 @@ ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } -ggml_tensor * llama_context::build_rope_shift( +ggml_tensor * llama_context_base::build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, @@ -1606,7 +1607,7 @@ ggml_tensor * llama_context::build_rope_shift( return tmp; } -ggml_tensor * llama_context::build_inp_embd( +ggml_tensor * llama_context_base::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) { @@ -1656,7 +1657,7 @@ ggml_tensor * llama_context::build_inp_embd( return inpL; } -ggml_tensor * llama_context::build_inp_pos( +ggml_tensor * llama_context_base::build_inp_pos( ggml_context * ctx0, int32_t n_tokens) { inp.pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); @@ -1665,7 +1666,7 @@ ggml_tensor * llama_context::build_inp_pos( return inp.pos; } -ggml_tensor * llama_context::build_inp_pos_bucket( +ggml_tensor * llama_context_base::build_inp_pos_bucket( ggml_context * ctx0, 
int32_t n_tokens) { inp.pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); @@ -1674,7 +1675,7 @@ ggml_tensor * llama_context::build_inp_pos_bucket( return inp.pos_bucket; } -ggml_tensor * llama_context::build_inp_out_ids( +ggml_tensor * llama_context_base::build_inp_out_ids( ggml_context * ctx0) { const int32_t n_out_ids = n_outputs; @@ -1684,7 +1685,7 @@ ggml_tensor * llama_context::build_inp_out_ids( return inp.out_ids; } -ggml_tensor * llama_context::build_inp_mean( +ggml_tensor * llama_context_base::build_inp_mean( ggml_context * ctx0, int32_t n_tokens) { inp.mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); @@ -1693,7 +1694,7 @@ ggml_tensor * llama_context::build_inp_mean( return inp.mean; } -ggml_tensor * llama_context::build_inp_cls( +ggml_tensor * llama_context_base::build_inp_cls( ggml_context * ctx0, int32_t n_tokens) { inp.cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -1702,7 +1703,7 @@ ggml_tensor * llama_context::build_inp_cls( return inp.cls; } -void llama_context::build_attn_inp( +void llama_context_base::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -1718,7 +1719,7 @@ void llama_context::build_attn_inp( inp.kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.kq_mask, GGML_TYPE_F16) : inp.kq_mask; } -ggml_tensor * llama_context::build_attn( +ggml_tensor * llama_context_base::build_attn( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -1745,7 +1746,7 @@ ggml_tensor * llama_context::build_attn( return cur; } -ggml_tensor * llama_context::build_attn_mha( +ggml_tensor * llama_context_base::build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q, @@ -1774,6 +1775,7 @@ ggml_tensor * llama_context::build_attn_mha( struct ggml_tensor * cur; + // TODO: replace hardcoded padding with ggml-provided padding if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) { GGML_UNUSED(model); @@ -1841,7 +1843,7 @@ ggml_tensor * llama_context::build_attn_mha( return cur; } -ggml_tensor * llama_context::build_inp_self_k_shift( +ggml_tensor * llama_context_base::build_inp_self_k_shift( ggml_context * ctx0) { GGML_UNUSED(ctx0); @@ -1849,7 +1851,7 @@ ggml_tensor * llama_context::build_inp_self_k_shift( return nullptr; } -void llama_context::build_kv_self_shift( +void llama_context_base::build_kv_self_shift( ggml_context * ctx0, ggml_cgraph * gf) { GGML_UNUSED(ctx0); @@ -1858,7 +1860,7 @@ void llama_context::build_kv_self_shift( LLAMA_LOG_ERROR("%s: not implemented\n", __func__); } -void llama_context::build_kv_self_defrag( +void llama_context_base::build_kv_self_defrag( ggml_context * ctx0, ggml_cgraph * gf) { GGML_UNUSED(ctx0); @@ -1872,7 +1874,7 @@ void llama_context::build_kv_self_defrag( // perf // -llama_perf_context_data llama_context::perf_get_data() const { +llama_perf_context_data llama_context_base::perf_get_data() const { llama_perf_context_data data = {}; data.t_start_ms = 1e-3 * t_start_us; @@ -1885,7 +1887,7 @@ llama_perf_context_data llama_context::perf_get_data() const { return data; } -void llama_context::perf_reset() { +void llama_context_base::perf_reset() { t_start_us = ggml_time_us(); t_eval_us = n_eval = 0; t_p_eval_us = n_p_eval = 0; @@ -2029,7 +2031,7 @@ class llama_io_read_file : public llama_io_read_i { std::vector temp_buffer; }; -size_t llama_context::state_get_size() { +size_t llama_context_base::state_get_size() { llama_io_write_dummy io; try { return state_get_data(io); @@ -2039,7 +2041,7 @@ size_t llama_context::state_get_size() { } } -size_t 
llama_context::state_get_data(uint8_t * dst, size_t size) { +size_t llama_context_base::state_get_data(uint8_t * dst, size_t size) { llama_io_write_buffer io(dst, size); try { return state_get_data(io); @@ -2049,7 +2051,7 @@ size_t llama_context::state_get_data(uint8_t * dst, size_t size) { } } -size_t llama_context::state_set_data(const uint8_t * src, size_t size) { +size_t llama_context_base::state_set_data(const uint8_t * src, size_t size) { llama_io_read_buffer io(src, size); try { return state_set_data(io); @@ -2059,7 +2061,7 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) { } } -size_t llama_context::state_seq_get_size(llama_seq_id seq_id) { +size_t llama_context_base::state_seq_get_size(llama_seq_id seq_id) { llama_io_write_dummy io; try { return state_seq_get_data(io, seq_id); @@ -2069,7 +2071,7 @@ size_t llama_context::state_seq_get_size(llama_seq_id seq_id) { } } -size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { +size_t llama_context_base::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { llama_io_write_buffer io(dst, size); try { return state_seq_get_data(io, seq_id); @@ -2079,7 +2081,7 @@ size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, siz } } -size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { +size_t llama_context_base::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { llama_io_read_buffer io(src, size); try { return state_seq_set_data(io, seq_id); @@ -2089,7 +2091,7 @@ size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * sr } } -bool llama_context::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +bool llama_context_base::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(filepath, "rb"); // sanity checks @@ -2132,7 +2134,7 @@ bool llama_context::state_load_file(const char * filepath, llama_token * tokens_ return true; } -bool llama_context::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { +bool llama_context_base::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) { llama_file file(filepath, "wb"); file.write_u32(LLAMA_SESSION_MAGIC); @@ -2149,7 +2151,7 @@ bool llama_context::state_save_file(const char * filepath, const llama_token * t return true; } -size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +size_t llama_context_base::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(filepath, "rb"); // version checks @@ -2192,7 +2194,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file return file.tell(); } -size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { +size_t llama_context_base::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) { llama_file file(filepath, "wb"); file.write_u32(LLAMA_STATE_SEQ_MAGIC); @@ -2212,7 +2214,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file return res; } -size_t 
llama_context::state_get_data(llama_io_write_i & io) { +size_t llama_context_base::state_get_data(llama_io_write_i & io) { LLAMA_LOG_DEBUG("%s: writing state\n", __func__); // write model info @@ -2285,7 +2287,7 @@ size_t llama_context::state_get_data(llama_io_write_i & io) { return io.n_bytes(); } -size_t llama_context::state_set_data(llama_io_read_i & io) { +size_t llama_context_base::state_set_data(llama_io_read_i & io) { LLAMA_LOG_DEBUG("%s: reading state\n", __func__); // read model info @@ -2366,13 +2368,13 @@ size_t llama_context::state_set_data(llama_io_read_i & io) { return io.n_bytes(); } -size_t llama_context::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { +size_t llama_context_base::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); return io.n_bytes(); } -size_t llama_context::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { +size_t llama_context_base::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); return io.n_bytes(); @@ -2386,7 +2388,7 @@ llama_context_kv_self::llama_context_kv_self( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context(model, params, gtype), + llama_context_base(model, params, gtype), kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_kv_self\n", __func__); @@ -2436,7 +2438,7 @@ void llama_context_kv_self::reserve() { LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); - llama_context::reserve(); + llama_context_base::reserve(); } llama_kv_cache * llama_context_kv_self::get_kv_self() { @@ -3033,7 +3035,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } // call base functionality - llama_context::input_set(ubatch); + llama_context_base::input_set(ubatch); if (inp.self_kq_mask || inp.self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
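// For orientation while reading this patch: the class layout it introduces is
// roughly the following (all names appear in the llama-context.h hunks below):
//
//   struct llama_context           - abstract interface mirroring the public C API
//   class  llama_context_i         - C++ alias for llama_context
//   class  llama_context_base      - basic transformer without a KV cache
//   class  llama_context_kv_self   - llama_context_base + self-attention KV cache
//   class  llama_context_recurrent - llama_context_base + recurrent state (RWKV, Mamba)
//   class  llama_context_enc       - encoder-only variant of llama_context_base
//   class  llama_context_enc_dec   - composes an encoder context with a decoder context
//
// With the interface split out, the public C API entry points can dispatch through
// the virtual methods without knowing which concrete context was constructed.
// A minimal sketch of such a wrapper (assumed shape, not part of this patch):

int32_t llama_decode(llama_context * ctx, llama_batch batch) {
    return ctx->decode(batch); // resolves to the _base / _kv_self / _recurrent / ... override
}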
@@ -3219,7 +3221,7 @@ ggml_cgraph * llama_context_kv_self::graph_init() { inp = {}; - return llama_context::graph_init(); + return llama_context_base::graph_init(); } ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { @@ -3719,7 +3721,7 @@ ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( // state save/load size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { - llama_context::state_get_data(io); + llama_context_base::state_get_data(io); LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); kv_self.state_write(io); @@ -3728,7 +3730,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { } size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { - llama_context::state_set_data(io); + llama_context_base::state_set_data(io); LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); kv_self.state_read(io); @@ -3737,7 +3739,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { } size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - llama_context::state_seq_get_data(io, seq_id); + llama_context_base::state_seq_get_data(io, seq_id); kv_self.state_write(io, seq_id); @@ -3745,7 +3747,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se } size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - llama_context::state_seq_set_data(io, seq_id); + llama_context_base::state_seq_set_data(io, seq_id); kv_self.state_read(io, seq_id); @@ -3760,7 +3762,7 @@ llama_context_recurrent::llama_context_recurrent( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context(model, params, gtype), + llama_context_base(model, params, gtype), kv_self(model.hparams) { LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); @@ -3807,7 +3809,7 @@ void llama_context_recurrent::reserve() { LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); // TODO: implement recurrent-specific reserve logic - llama_context::reserve(); + llama_context_base::reserve(); } llama_kv_cache * llama_context_recurrent::get_kv_self() { @@ -4139,7 +4141,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // call base functionality - llama_context::input_set(ubatch); + llama_context_base::input_set(ubatch); GGML_ASSERT(kv_self.recurrent); @@ -4193,7 +4195,7 @@ ggml_cgraph * llama_context_recurrent::graph_init() { inp.s_copy = nullptr; inp.s_mask = nullptr; - return llama_context::graph_init(); + return llama_context_base::graph_init(); } ggml_tensor * llama_context_recurrent::build_inp_s_copy( @@ -4602,7 +4604,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( // state save/load size_t llama_context_recurrent::state_get_data(llama_io_write_i & io) { - llama_context::state_get_data(io); + llama_context_base::state_get_data(io); kv_self.state_write(io); @@ -4610,7 +4612,7 @@ size_t llama_context_recurrent::state_get_data(llama_io_write_i & io) { } size_t llama_context_recurrent::state_set_data(llama_io_read_i & io) { - llama_context::state_set_data(io); + llama_context_base::state_set_data(io); kv_self.state_read(io); @@ -4618,7 +4620,7 @@ size_t llama_context_recurrent::state_set_data(llama_io_read_i & io) { } size_t llama_context_recurrent::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - llama_context::state_seq_get_data(io, seq_id); + 
llama_context_base::state_seq_get_data(io, seq_id); kv_self.state_write(io, seq_id); @@ -4626,7 +4628,7 @@ size_t llama_context_recurrent::state_seq_get_data(llama_io_write_i & io, llama_ } size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - llama_context::state_seq_set_data(io, seq_id); + llama_context_base::state_seq_set_data(io, seq_id); kv_self.state_read(io, seq_id); @@ -4640,7 +4642,7 @@ size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_s llama_context_enc_dec::llama_context_enc_dec( const llama_model & model, llama_context_params params) : - llama_context(model, params, LLAMA_GRAPH_TYPE_ENCODER), + llama_context_enc(model, params, LLAMA_GRAPH_TYPE_ENCODER), ctx_dec(model, params, LLAMA_GRAPH_TYPE_DECODER) { LLAMA_LOG_INFO("%s: constructing llama_context_enc_dec\n", __func__); } diff --git a/src/llama-context.h b/src/llama-context.h index 5b63b3b06d21c..d647a426cd1be 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -20,90 +20,78 @@ class llama_io_write_i; using llama_loras = std::unordered_map; -// basic transformer without KV cache -struct llama_context : public llama_graph_i { +// abstract interface corresponding to the public C API +struct llama_context { public: - llama_context( - const llama_model & model, - llama_context_params params, - llama_graph_type gtype); - - virtual ~llama_context(); - - // init scheduler and compute buffers, reserve worst-case graphs - // call once after the context is constructed - virtual void init(); + llama_context() = default; + virtual ~llama_context() = default; - virtual void synchronize(); + virtual void init() = 0; -protected: - // called by init() to reserve the worst-case graphs - // override in child classes - virtual void reserve(); + virtual void synchronize() = 0; -public: - const llama_model & get_model() const; - const llama_cparams & get_cparams() const; + virtual const llama_model & get_model() const = 0; + virtual const llama_cparams & get_cparams() const = 0; - virtual uint32_t n_ctx() const; - virtual uint32_t n_ctx_per_seq() const; - virtual uint32_t n_batch() const; - virtual uint32_t n_ubatch() const; - virtual uint32_t n_seq_max() const; + virtual uint32_t n_ctx() const = 0; + virtual uint32_t n_ctx_per_seq() const = 0; + virtual uint32_t n_batch() const = 0; + virtual uint32_t n_ubatch() const = 0; + virtual uint32_t n_seq_max() const = 0; - virtual uint32_t n_threads() const; - virtual uint32_t n_threads_batch() const; + virtual uint32_t n_threads() const = 0; + virtual uint32_t n_threads_batch() const = 0; - virtual int32_t max_nodes() const; + virtual int32_t max_nodes() const = 0; // self-attention: // if the context does not have a KV cache, return nullptr - virtual llama_kv_cache * get_kv_self(); - virtual const llama_kv_cache * get_kv_self() const; + virtual llama_kv_cache * get_kv_self() = 0; + virtual const llama_kv_cache * get_kv_self() const = 0; // if the context does not have a KV cache, noop - virtual void kv_self_update(); + virtual void kv_self_update() = 0; - virtual enum llama_pooling_type pooling_type() const; + virtual enum llama_pooling_type pooling_type() const = 0; - virtual float * get_logits(); - virtual float * get_logits_ith(int32_t i); + virtual float * get_logits() = 0; + virtual float * get_logits_ith(int32_t i) = 0; - virtual float * get_embeddings(); - virtual float * get_embeddings_ith(int32_t i); - virtual float * get_embeddings_seq(llama_seq_id seq_id); + virtual float * get_embeddings() = 0; + virtual 
float * get_embeddings_ith(int32_t i) = 0; + virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; - virtual int64_t n_pos_per_token() const; // vision + virtual int64_t n_pos_per_token() const = 0; // vision virtual void attach_threadpool( ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch); + ggml_threadpool_t threadpool_batch) = 0; - virtual void detach_threadpool(); + virtual void detach_threadpool() = 0; - virtual void set_n_threads(int32_t n_threads, int32_t n_threads_batch); + virtual void set_n_threads(int32_t n_threads, int32_t n_threads_batch) = 0; - virtual void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); + virtual void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) = 0; - virtual void set_embeddings (bool value); - virtual void set_causal_attn(bool value); + virtual void set_embeddings (bool value) = 0; + virtual void set_causal_attn(bool value) = 0; virtual void set_adapter_lora( llama_adapter_lora * adapter, - float scale); + float scale) = 0; virtual bool rm_adapter_lora( - llama_adapter_lora * adapter); + llama_adapter_lora * adapter) = 0; - virtual void clear_adapter_lora(); + virtual void clear_adapter_lora() = 0; virtual bool apply_adapter_cvec( const float * data, size_t len, int32_t n_embd, int32_t il_start, - int32_t il_end); + int32_t il_end) = 0; // encode a batch of tokens by evaluating the encoder part of the transformer // @@ -114,7 +102,7 @@ struct llama_context : public llama_graph_i { // return positive int on warning // return negative int on error // - virtual int encode(llama_batch & inp_batch); + virtual int encode(llama_batch & inp_batch) = 0; // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), @@ -128,7 +116,145 @@ struct llama_context : public llama_graph_i { // return positive int on warning // return negative int on error // - virtual int decode(llama_batch & inp_batch); + virtual int decode(llama_batch & inp_batch) = 0; + + // + // perf + // + + virtual llama_perf_context_data perf_get_data() const = 0; + virtual void perf_reset() = 0; + + // + // state save/load + // + + virtual size_t state_get_size() = 0; + virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; + virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; + + virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; + virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; + virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; + + virtual bool state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) = 0; + + virtual bool state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) = 0; + + virtual size_t state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) = 0; + + virtual size_t state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) = 0; +}; + +// C++ alias +class llama_context_i : public llama_context { +public: + using llama_context::llama_context; +}; + +// basic transformer without KV cache +class llama_context_base : public llama_context_i, public llama_graph_i { +public: + llama_context_base( + const llama_model & model, + llama_context_params params, + 
llama_graph_type gtype); + + virtual ~llama_context_base(); + + // init scheduler and compute buffers, reserve worst-case graphs + // call once after the context is constructed + void init() override; + + void synchronize() override; + +protected: + // called by init() to reserve the worst-case graphs + // override in child classes + virtual void reserve(); + +public: + const llama_model & get_model() const override; + const llama_cparams & get_cparams() const override; + + uint32_t n_ctx() const override; + uint32_t n_ctx_per_seq() const override; + uint32_t n_batch() const override; + uint32_t n_ubatch() const override; + uint32_t n_seq_max() const override; + + uint32_t n_threads() const override; + uint32_t n_threads_batch() const override; + + int32_t max_nodes() const override; + + // self-attention: + + // if the context does not have a KV cache, return nullptr + llama_kv_cache * get_kv_self() override; + const llama_kv_cache * get_kv_self() const override; + + // if the context does not have a KV cache, noop + void kv_self_update() override; + + enum llama_pooling_type pooling_type() const override; + + float * get_logits() override; + float * get_logits_ith(int32_t i) override; + + float * get_embeddings() override; + float * get_embeddings_ith(int32_t i) override; + float * get_embeddings_seq(llama_seq_id seq_id) override; + + int64_t n_pos_per_token() const override; // vision + + void attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) override; + + void detach_threadpool() override; + + void set_n_threads(int32_t n_threads, int32_t n_threads_batch) override; + + void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) override; + + void set_embeddings (bool value) override; + void set_causal_attn(bool value) override; + + void set_adapter_lora( + llama_adapter_lora * adapter, + float scale) override; + + bool rm_adapter_lora( + llama_adapter_lora * adapter) override; + + void clear_adapter_lora() override; + + bool apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) override; + + int encode(llama_batch & inp_batch) override; + int decode(llama_batch & inp_batch) override; protected: // @@ -297,8 +423,8 @@ struct llama_context : public llama_graph_i { // perf // - virtual llama_perf_context_data perf_get_data() const; - virtual void perf_reset(); + llama_perf_context_data perf_get_data() const override; + void perf_reset() override; protected: // TODO: become private @@ -318,37 +444,37 @@ struct llama_context : public llama_graph_i { // state save/load // - virtual size_t state_get_size(); - virtual size_t state_get_data( uint8_t * dst, size_t size); - virtual size_t state_set_data(const uint8_t * src, size_t size); + size_t state_get_size() override; + size_t state_get_data( uint8_t * dst, size_t size) override; + size_t state_set_data(const uint8_t * src, size_t size) override; - virtual size_t state_seq_get_size(llama_seq_id seq_id); - virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size); - virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size); + size_t state_seq_get_size(llama_seq_id seq_id) override; + size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; + size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; - virtual bool state_load_file( + bool state_load_file( const char * filepath, llama_token * tokens_out, 
size_t n_token_capacity, - size_t * n_token_count_out); + size_t * n_token_count_out) override; - virtual bool state_save_file( + bool state_save_file( const char * filepath, const llama_token * tokens, - size_t n_token_count); + size_t n_token_count) override; - virtual size_t state_seq_load_file( + size_t state_seq_load_file( llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, - size_t * n_token_count_out); + size_t * n_token_count_out) override; - virtual size_t state_seq_save_file( + size_t state_seq_save_file( llama_seq_id seq_id, const char * filepath, const llama_token * tokens, - size_t n_token_count); + size_t n_token_count) override; protected: virtual size_t state_get_data(llama_io_write_i & io); @@ -417,7 +543,7 @@ struct llama_context : public llama_graph_i { }; // transformer with a self-attention KV cache -class llama_context_kv_self : public llama_context { +class llama_context_kv_self : public llama_context_base { public: llama_context_kv_self( const llama_model & model, @@ -542,7 +668,7 @@ class llama_context_kv_self : public llama_context { }; // a recurrent transformer (ie.e RWKV, Mamba) -class llama_context_recurrent : public llama_context { +class llama_context_recurrent : public llama_context_base { public: llama_context_recurrent( const llama_model & model, @@ -656,12 +782,12 @@ class llama_context_recurrent : public llama_context { llama_kv_cache_recurrent kv_self; }; -class llama_context_enc : public llama_context { +class llama_context_enc : public llama_context_base { public: - using llama_context::llama_context; + using llama_context_base::llama_context_base; }; -class llama_context_enc_dec : public llama_context { +class llama_context_enc_dec : public llama_context_enc { public: llama_context_enc_dec( const llama_model & model, From be58e30017b445e2146c8bc1784ae0b291fae48c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Feb 2025 15:16:45 +0200 Subject: [PATCH 74/84] enc-dec : compose wip ggml-ci --- src/llama-context.cpp | 777 ++++++++++++++++++++++++++++++++---------- src/llama-context.h | 261 +++++++++++--- src/llama-graph.cpp | 26 +- src/llama-graph.h | 14 +- src/llama-model.cpp | 328 +++++++++--------- 5 files changed, 1002 insertions(+), 404 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6b101f4869e44..81663c40018e3 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -261,7 +261,7 @@ void llama_context_base::init() { LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size()); - const size_t max_nodes = this->max_nodes(); + const size_t max_nodes = this->graph_max_nodes(); LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); @@ -420,10 +420,6 @@ const llama_model & llama_context_base::get_model() const { return model; } -const llama_cparams & llama_context_base::get_cparams() const { - return cparams; -} - uint32_t llama_context_base::n_ctx() const { return cparams.n_ctx; } @@ -452,10 +448,6 @@ uint32_t llama_context_base::n_threads_batch() const { return cparams.n_threads_batch; } -int32_t llama_context_base::max_nodes() const { - return std::max(8192, 5*model.n_tensors()); -} - llama_kv_cache * llama_context_base::get_kv_self() { LLAMA_LOG_WARN("%s: llama_context_base does not have a KV cache\n", __func__); return nullptr; @@ -573,10 +565,6 @@ float * llama_context_base::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } -int64_t llama_context_base::n_pos_per_token() const { - return model.arch == 
LLM_ARCH_QWEN2VL ? 4 : 1; -} - void llama_context_base::attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) { @@ -1007,6 +995,10 @@ int llama_context_base::decode(llama_batch & inp_batch) { // input // +int64_t llama_context_base::n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; +} + void llama_context_base::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; @@ -1391,6 +1383,10 @@ void llama_context_base::output_reorder() { // graph // +int32_t llama_context_base::graph_max_nodes() const { + return std::max(8192, 5*model.n_tensors()); +} + ggml_cgraph * llama_context_base::graph_init() { inp = {}; @@ -1402,7 +1398,7 @@ ggml_cgraph * llama_context_base::graph_init() { ctx_compute.reset(ggml_init(params)); - return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); + return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); } llama_graph_result llama_context_base::graph_build( @@ -2034,7 +2030,7 @@ class llama_io_read_file : public llama_io_read_i { size_t llama_context_base::state_get_size() { llama_io_write_dummy io; try { - return state_get_data(io); + return state_write_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); return 0; @@ -2044,7 +2040,7 @@ size_t llama_context_base::state_get_size() { size_t llama_context_base::state_get_data(uint8_t * dst, size_t size) { llama_io_write_buffer io(dst, size); try { - return state_get_data(io); + return state_write_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); return 0; @@ -2054,7 +2050,7 @@ size_t llama_context_base::state_get_data(uint8_t * dst, size_t size) { size_t llama_context_base::state_set_data(const uint8_t * src, size_t size) { llama_io_read_buffer io(src, size); try { - return state_set_data(io); + return state_read_data(io); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; @@ -2064,7 +2060,7 @@ size_t llama_context_base::state_set_data(const uint8_t * src, size_t size) { size_t llama_context_base::state_seq_get_size(llama_seq_id seq_id) { llama_io_write_dummy io; try { - return state_seq_get_data(io, seq_id); + return state_seq_write_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what()); return 0; @@ -2074,7 +2070,7 @@ size_t llama_context_base::state_seq_get_size(llama_seq_id seq_id) { size_t llama_context_base::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { llama_io_write_buffer io(dst, size); try { - return state_seq_get_data(io, seq_id); + return state_seq_write_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what()); return 0; @@ -2084,7 +2080,7 @@ size_t llama_context_base::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst size_t llama_context_base::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { llama_io_read_buffer io(src, size); try { - return state_seq_set_data(io, seq_id); + return state_seq_read_data(io, seq_id); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what()); return 0; @@ -2123,7 +2119,7 @@ bool llama_context_base::state_load_file(const char * filepath, llama_token * to const size_t n_state_size_cur = file.size() - 
file.tell(); llama_io_read_file io( &file); - const size_t n_read = state_set_data(io); + const size_t n_read = state_read_data(io); if (n_read != n_state_size_cur) { LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read); @@ -2146,7 +2142,7 @@ bool llama_context_base::state_save_file(const char * filepath, const llama_toke // save the context state using stream saving llama_io_write_file io(&file); - state_get_data(io); + state_write_data(io); return true; } @@ -2182,7 +2178,7 @@ size_t llama_context_base::state_seq_load_file(llama_seq_id seq_id, const char * { const size_t state_size = file.size() - file.tell(); llama_io_read_file io(&file); - const size_t nread = state_seq_set_data(io, seq_id); + const size_t nread = state_seq_read_data(io, seq_id); if (!nread) { LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); return 0; @@ -2206,7 +2202,7 @@ size_t llama_context_base::state_seq_save_file(llama_seq_id seq_id, const char * // save the context state using stream saving llama_io_write_file io(&file); - state_seq_get_data(io, seq_id); + state_seq_write_data(io, seq_id); const size_t res = file.tell(); GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes()); @@ -2214,7 +2210,7 @@ size_t llama_context_base::state_seq_save_file(llama_seq_id seq_id, const char * return res; } -size_t llama_context_base::state_get_data(llama_io_write_i & io) { +size_t llama_context_base::state_write_data(llama_io_write_i & io) { LLAMA_LOG_DEBUG("%s: writing state\n", __func__); // write model info @@ -2287,7 +2283,7 @@ size_t llama_context_base::state_get_data(llama_io_write_i & io) { return io.n_bytes(); } -size_t llama_context_base::state_set_data(llama_io_read_i & io) { +size_t llama_context_base::state_read_data(llama_io_read_i & io) { LLAMA_LOG_DEBUG("%s: reading state\n", __func__); // read model info @@ -2368,13 +2364,13 @@ size_t llama_context_base::state_set_data(llama_io_read_i & io) { return io.n_bytes(); } -size_t llama_context_base::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { +size_t llama_context_base::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); return io.n_bytes(); } -size_t llama_context_base::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { +size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { GGML_UNUSED(seq_id); return io.n_bytes(); @@ -2400,9 +2396,6 @@ llama_context_kv_self::llama_context_kv_self( LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); - // build worst-case graph for encoder if a model contains encoder - is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() - uint32_t kv_size = cparams.n_ctx; ggml_type type_k = params.type_k; ggml_type type_v = params.type_v; @@ -2537,8 +2530,6 @@ void llama_context_kv_self::kv_self_update() { } int llama_context_kv_self::encode(llama_batch & inp_batch) { - is_encoding = true; - if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; @@ -2589,7 +2580,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { output_ids[i] = i; } - inp_embd_enc = NULL; n_outputs = n_tokens; //batch_manager->prepare(ubatch); @@ -2624,65 +2614,48 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); GGML_ASSERT(backend_embd != nullptr); - if 
(llama_model_has_decoder(&model)) { - embd_enc.resize(n_tokens*n_embd); - float * embd_out = embd_enc.data(); - - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - // remember the sequence ids used during the encoding - needed for cross attention later - seq_ids_enc.resize(n_tokens); - for (int32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - seq_ids_enc[i].insert(seq_id); - } - } - } else { - GGML_ASSERT(embd != nullptr); + GGML_ASSERT(embd != nullptr); - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(embd != nullptr); - float * embd_out = embd; + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings - auto & embd_seq_out = embd_seq; - embd_seq_out.clear(); + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - for (int32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); } - } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } } } @@ -2694,8 +2667,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { } int llama_context_kv_self::decode(llama_batch & inp_batch) { - is_encoding = false; - if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: 
n_tokens == 0\n", __func__); return -1; @@ -3039,7 +3010,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { if (inp.self_kq_mask || inp.self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. - if (cparams.causal_attn && !is_encoding) { + if (cparams.causal_attn) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -3116,7 +3087,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); @@ -3175,50 +3146,9 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { } } } - - if (!is_encoding && inp_embd_enc) { - assert(inp_embd_enc->type == GGML_TYPE_F32); - assert((size_t) ggml_nelements(inp_embd_enc) == embd_enc.size()); - - ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); - } - - if (!is_encoding && inp_kq_mask_cross) { - const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp_kq_mask_cross->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - - float * data = (float *) inp_kq_mask_cross->data; - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_output_enc; ++i) { - float f = -INFINITY; - for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[j][s]; - if (seq_ids_enc[i].find(seq_id) != seq_ids_enc[i].end()) { - f = 0.0f; - } - } - data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f; - } - } - - for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_output_enc; ++j) { - data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY; - } - } - } - } } ggml_cgraph * llama_context_kv_self::graph_init() { - inp_embd_enc = nullptr; - inp_kq_mask_cross = nullptr; - inp = {}; return llama_context_base::graph_init(); @@ -3441,7 +3371,7 @@ void llama_context_kv_self::build_kv_self_defrag( // - x2 for keys and values //const uint32_t max_moves = max_nodes()/(6*n_layer); // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (max_nodes() - 2*n_layer)/(6*n_layer); + const uint32_t max_moves = (graph_max_nodes() - 2*n_layer)/(6*n_layer); // determine which KV cells to move where // @@ -3689,39 +3619,10 @@ void llama_context_kv_self::build_kv_self_defrag( #endif } -ggml_tensor * llama_context_kv_self::build_inp_embd_enc( - ggml_context * ctx0) { - const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; - - // TODO: not sure if this is correct - const int32_t n_outputs_enc = embd_enc.size() / n_embd; - - inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); - ggml_set_input(inp_embd_enc); - - return inp_embd_enc; -} - -ggml_tensor * llama_context_kv_self::build_inp_kq_mask_cross( - ggml_context * ctx0, - int32_t n_tokens) { - const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; - - // TODO: not sure if this is correct - const int32_t 
n_outputs_enc = embd_enc.size() / n_embd; - - inp_kq_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - ggml_set_input(inp_kq_mask_cross); - - return inp_kq_mask_cross; -} - // state save/load -size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { - llama_context_base::state_get_data(io); +size_t llama_context_kv_self::state_write_data(llama_io_write_i & io) { + llama_context_base::state_write_data(io); LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); kv_self.state_write(io); @@ -3729,8 +3630,8 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { return io.n_bytes(); } -size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { - llama_context_base::state_set_data(io); +size_t llama_context_kv_self::state_read_data(llama_io_read_i & io) { + llama_context_base::state_read_data(io); LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); kv_self.state_read(io); @@ -3738,16 +3639,16 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - llama_context_base::state_seq_get_data(io, seq_id); +size_t llama_context_kv_self::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { + llama_context_base::state_seq_write_data(io, seq_id); kv_self.state_write(io, seq_id); return io.n_bytes(); } -size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - llama_context_base::state_seq_set_data(io, seq_id); +size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { + llama_context_base::state_seq_read_data(io, seq_id); kv_self.state_read(io, seq_id); @@ -4603,54 +4504,568 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( // state save/load -size_t llama_context_recurrent::state_get_data(llama_io_write_i & io) { - llama_context_base::state_get_data(io); +size_t llama_context_recurrent::state_write_data(llama_io_write_i & io) { + llama_context_base::state_write_data(io); kv_self.state_write(io); return io.n_bytes(); } -size_t llama_context_recurrent::state_set_data(llama_io_read_i & io) { - llama_context_base::state_set_data(io); +size_t llama_context_recurrent::state_read_data(llama_io_read_i & io) { + llama_context_base::state_read_data(io); kv_self.state_read(io); return io.n_bytes(); } -size_t llama_context_recurrent::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { - llama_context_base::state_seq_get_data(io, seq_id); +size_t llama_context_recurrent::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { + llama_context_base::state_seq_write_data(io, seq_id); kv_self.state_write(io, seq_id); return io.n_bytes(); } -size_t llama_context_recurrent::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { - llama_context_base::state_seq_set_data(io, seq_id); +size_t llama_context_recurrent::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { + llama_context_base::state_seq_read_data(io, seq_id); kv_self.state_read(io, seq_id); return io.n_bytes(); } +// +// llama_context_enc +// + +int llama_context_enc::encode(llama_batch & inp_batch) { + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + llama_batch_allocr batch_allocr(inp_batch, 0); + + const llama_batch & batch = batch_allocr.batch; + + const int32_t 
n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (int32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (output_reserve(n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (int32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + n_outputs = n_tokens; + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + input_set(ubatch); + + const auto compute_status = graph_compute(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + GGML_ASSERT(embd != nullptr); + + // extract token embeddings + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (int32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
+ ggml_backend_sched_reset(sched.get()); + + cross->n_outputs = n_tokens; + cross->embd_enc = embd; + + // remember the sequence ids used during the encoding - needed for cross attention later + cross->seq_ids_enc.resize(n_tokens); + for (int32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + cross->seq_ids_enc[i].insert(seq_id); + } + } + + return 0; +} + +// +// llama_context_dec +// + +void llama_context_dec::reserve() { + // simulate full KV cache + cross->n_outputs = cparams.n_ubatch; + + LLAMA_LOG_DEBUG("%s: n_outputs = %u\n", __func__, cross->n_outputs); + + llama_context_kv_self::reserve(); +} + +void llama_context_dec::input_set(const llama_ubatch & ubatch) { + // call base functionality + llama_context_kv_self::input_set(ubatch); + + if (inp.cross_embd) { + assert(inp.cross_embd->type == GGML_TYPE_F32); + assert(ggml_nelements(inp.cross_embd) == cross->n_outputs*model.hparams.n_embd); + + ggml_backend_tensor_set(inp.cross_embd, cross->embd_enc, 0, ggml_nbytes(inp.cross_embd)); + } + + if (inp.cross_kq_mask) { + const int64_t n_output_enc = cross->n_outputs; + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(inp.cross_kq_mask->buffer)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing + + float * data = (float *) inp.cross_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_output_enc; ++i) { + float f = -INFINITY; + for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[j][s]; + if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) { + f = 0.0f; + } + } + data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f; + } + } + + for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + for (int j = 0; j < n_output_enc; ++j) { + data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY; + } + } + } + } +} + +ggml_cgraph * llama_context_dec::graph_init() { + inp = {}; + + return llama_context_kv_self::graph_init(); +} + +ggml_tensor * llama_context_dec::build_inp_cross_embd( + ggml_context * ctx0) { + const auto & hparams = model.hparams; + const int64_t n_embd = hparams.n_embd; + + const int32_t n_outputs_enc = cross->n_outputs; + + inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); + ggml_set_input(inp.cross_embd); + + return inp.cross_embd; +} + +void llama_context_dec::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) { + llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); + + const int32_t n_outputs_enc = cross->n_outputs; + + inp.cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + ggml_set_input(inp.cross_kq_mask); + + inp.cross_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp.cross_kq_mask, GGML_TYPE_F16) : inp.cross_kq_mask; +} + +ggml_tensor * llama_context_dec::build_attn_cross( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il) { + GGML_UNUSED(il); + + const auto & kq_mask = inp.cross_kq_mask_cnv; + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + //cb(k, "v", il); + + ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale); + + return cur; +} + // // llama_context_enc_dec // llama_context_enc_dec::llama_context_enc_dec( const llama_model & model, - llama_context_params params) : - llama_context_enc(model, params, LLAMA_GRAPH_TYPE_ENCODER), - ctx_dec(model, params, LLAMA_GRAPH_TYPE_DECODER) { + llama_context_params params) { LLAMA_LOG_INFO("%s: constructing llama_context_enc_dec\n", __func__); + + ctx_enc = std::make_unique<llama_context_enc>(model, params, LLAMA_GRAPH_TYPE_ENCODER); + ctx_dec = std::make_unique<llama_context_dec>(model, params, LLAMA_GRAPH_TYPE_DECODER); + + ctx_enc->cross = &cross; + ctx_dec->cross = &cross; } llama_context_enc_dec::~llama_context_enc_dec() { LLAMA_LOG_INFO("%s: destructing llama_context_enc_dec\n", __func__); } +void llama_context_enc_dec::init() { + ctx_enc->init(); + ctx_dec->init(); +} + +void llama_context_enc_dec::synchronize() { + ctx_enc->synchronize(); + ctx_dec->synchronize(); +} + +const llama_model & llama_context_enc_dec::get_model() const { + return ctx_enc->get_model(); +} + +uint32_t llama_context_enc_dec::n_ctx() const { + return ctx_dec->n_ctx(); +} + +uint32_t llama_context_enc_dec::n_ctx_per_seq() const { + return ctx_dec->n_ctx_per_seq(); +} + +uint32_t llama_context_enc_dec::n_batch() const { + return ctx_dec->n_batch(); +} + +uint32_t llama_context_enc_dec::n_ubatch() const { + return ctx_dec->n_ubatch(); +} + +uint32_t llama_context_enc_dec::n_seq_max() const { + return ctx_dec->n_seq_max(); +} + +uint32_t llama_context_enc_dec::n_threads() const { + return ctx_dec->n_threads(); +} + +uint32_t llama_context_enc_dec::n_threads_batch() const { + return ctx_dec->n_threads_batch(); +} + +llama_kv_cache * llama_context_enc_dec::get_kv_self() { + return ctx_dec->get_kv_self(); +} + +const llama_kv_cache * llama_context_enc_dec::get_kv_self() const { + return ctx_dec->get_kv_self(); +} + +void llama_context_enc_dec::kv_self_update() { + ctx_dec->kv_self_update(); +} + +enum llama_pooling_type llama_context_enc_dec::pooling_type() const { + return ctx_enc->pooling_type(); +} + +float * llama_context_enc_dec::get_logits() { + return ctx_dec->get_logits(); +} + +float * llama_context_enc_dec::get_logits_ith(int32_t i) { + return ctx_dec->get_logits_ith(i); +} + +float * llama_context_enc_dec::get_embeddings() { + return ctx_enc->get_embeddings(); +} + +float * llama_context_enc_dec::get_embeddings_ith(int32_t i) { + return ctx_enc->get_embeddings_ith(i); +} + +float * llama_context_enc_dec::get_embeddings_seq(llama_seq_id seq_id) { + return ctx_enc->get_embeddings_seq(seq_id); +} + +void llama_context_enc_dec::attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + // TODO: attach to both - not sure if this is correct + ctx_enc->attach_threadpool(threadpool, threadpool_batch); + ctx_dec->attach_threadpool(threadpool, threadpool_batch); +} + +void llama_context_enc_dec::detach_threadpool() { 
ctx_enc->detach_threadpool(); + ctx_dec->detach_threadpool(); +} + +void llama_context_enc_dec::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { + ctx_enc->set_n_threads(n_threads, n_threads_batch); + ctx_dec->set_n_threads(n_threads, n_threads_batch); +} + +void llama_context_enc_dec::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { + ctx_enc->set_abort_callback(abort_callback, abort_callback_data); + ctx_dec->set_abort_callback(abort_callback, abort_callback_data); +} + +void llama_context_enc_dec::set_embeddings(bool value) { + GGML_UNUSED(value); + LLAMA_LOG_WARN("%s: set_embeddings() not supported for llama_context_enc_dec\n", __func__); +} + +void llama_context_enc_dec::set_causal_attn(bool value) { + GGML_UNUSED(value); + LLAMA_LOG_WARN("%s: set_causal_attn() not supported for llama_context_enc_dec\n", __func__); +} + +void llama_context_enc_dec::set_adapter_lora( + llama_adapter_lora * adapter, + float scale) { + ctx_dec->set_adapter_lora(adapter, scale); +} + +bool llama_context_enc_dec::rm_adapter_lora( + llama_adapter_lora * adapter) { + return ctx_dec->rm_adapter_lora(adapter); +} + +void llama_context_enc_dec::clear_adapter_lora() { + ctx_dec->clear_adapter_lora(); +} + +bool llama_context_enc_dec::apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return ctx_dec->apply_adapter_cvec(data, len, n_embd, il_start, il_end); +} + +int llama_context_enc_dec::encode(llama_batch & inp_batch) { + return ctx_enc->encode(inp_batch); +} + +int llama_context_enc_dec::decode(llama_batch & inp_batch) { + return ctx_dec->decode(inp_batch); +} + +// +// perf +// + +llama_perf_context_data llama_context_enc_dec::perf_get_data() const { + return ctx_dec->perf_get_data(); +} + +void llama_context_enc_dec::perf_reset() { + ctx_enc->perf_reset(); + ctx_dec->perf_reset(); +} + +// +// state save/load +// + +size_t llama_context_enc_dec::state_get_size() { + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_get_data( uint8_t * dst, size_t size) { + GGML_UNUSED(dst); + GGML_UNUSED(size); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_set_data(const uint8_t * src, size_t size) { + GGML_UNUSED(src); + GGML_UNUSED(size); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_get_size(llama_seq_id seq_id) { + GGML_UNUSED(seq_id); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { + GGML_UNUSED(seq_id); + GGML_UNUSED(dst); + GGML_UNUSED(size); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { + GGML_UNUSED(seq_id); + GGML_UNUSED(src); + GGML_UNUSED(size); + GGML_ABORT("TODO: implement"); +} + +bool llama_context_enc_dec::state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) { + GGML_UNUSED(filepath); + GGML_UNUSED(tokens_out); + GGML_UNUSED(n_token_capacity); + GGML_UNUSED(n_token_count_out); + GGML_ABORT("TODO: implement"); +} + +bool llama_context_enc_dec::state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) { + GGML_UNUSED(filepath); + GGML_UNUSED(tokens); + GGML_UNUSED(n_token_count); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + 
llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) { + GGML_UNUSED(seq_id); + GGML_UNUSED(filepath); + GGML_UNUSED(tokens_out); + GGML_UNUSED(n_token_capacity); + GGML_UNUSED(n_token_count_out); + GGML_ABORT("TODO: implement"); +} + +size_t llama_context_enc_dec::state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) { + GGML_UNUSED(seq_id); + GGML_UNUSED(filepath); + GGML_UNUSED(tokens); + GGML_UNUSED(n_token_count); + GGML_ABORT("TODO: implement"); +} + // // interface implementation // diff --git a/src/llama-context.h b/src/llama-context.h index d647a426cd1be..3165865a73c37 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -30,8 +30,7 @@ struct llama_context { virtual void synchronize() = 0; - virtual const llama_model & get_model() const = 0; - virtual const llama_cparams & get_cparams() const = 0; + virtual const llama_model & get_model() const = 0; virtual uint32_t n_ctx() const = 0; virtual uint32_t n_ctx_per_seq() const = 0; @@ -42,8 +41,6 @@ struct llama_context { virtual uint32_t n_threads() const = 0; virtual uint32_t n_threads_batch() const = 0; - virtual int32_t max_nodes() const = 0; - // self-attention: // if the context does not have a KV cache, return nullptr @@ -62,8 +59,6 @@ struct llama_context { virtual float * get_embeddings_ith(int32_t i) = 0; virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; - virtual int64_t n_pos_per_token() const = 0; // vision - virtual void attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) = 0; @@ -190,8 +185,7 @@ class llama_context_base : public llama_context_i, public llama_graph_i { virtual void reserve(); public: - const llama_model & get_model() const override; - const llama_cparams & get_cparams() const override; + const llama_model & get_model() const override; uint32_t n_ctx() const override; uint32_t n_ctx_per_seq() const override; @@ -202,15 +196,9 @@ class llama_context_base : public llama_context_i, public llama_graph_i { uint32_t n_threads() const override; uint32_t n_threads_batch() const override; - int32_t max_nodes() const override; - - // self-attention: - - // if the context does not have a KV cache, return nullptr llama_kv_cache * get_kv_self() override; const llama_kv_cache * get_kv_self() const override; - // if the context does not have a KV cache, noop void kv_self_update() override; enum llama_pooling_type pooling_type() const override; @@ -222,8 +210,6 @@ class llama_context_base : public llama_context_i, public llama_graph_i { float * get_embeddings_ith(int32_t i) override; float * get_embeddings_seq(llama_seq_id seq_id) override; - int64_t n_pos_per_token() const override; // vision - void attach_threadpool( ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch) override; @@ -261,6 +247,8 @@ class llama_context_base : public llama_context_i, public llama_graph_i { // input // + virtual int64_t n_pos_per_token() const; // vision + // when the compute graph is built, it creates the input tensors that it needs // the contents of the input tensors are set by the input_set() function @@ -299,6 +287,8 @@ class llama_context_base : public llama_context_i, public llama_graph_i { // graph // + virtual int32_t graph_max_nodes() const; + // zero-out inputs and create the ctx_compute for the compute graph virtual ggml_cgraph * graph_init(); @@ -477,11 +467,11 @@ class llama_context_base : public llama_context_i, public llama_graph_i { size_t n_token_count) 
override; protected: - virtual size_t state_get_data(llama_io_write_i & io); - virtual size_t state_set_data(llama_io_read_i & io); + virtual size_t state_write_data(llama_io_write_i & io); + virtual size_t state_read_data (llama_io_read_i & io); - virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id); - virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id); + virtual size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id); + virtual size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id); // // members @@ -625,39 +615,15 @@ class llama_context_kv_self : public llama_context_base { ggml_context * ctx0, ggml_cgraph * gf) override; - // ======================================================= - // === encoder-decoder === - // - // TODO: this is temporary here, it will be moved - // - - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch] - - ggml_tensor * build_inp_embd_enc( - ggml_context * ctx0) override; - - ggml_tensor * build_inp_kq_mask_cross( - ggml_context * ctx0, - int32_t n_tokens) override; - // ====================================================== - // // state save/load // - size_t state_get_data(llama_io_write_i & io) override; - size_t state_set_data(llama_io_read_i & io) override; + size_t state_write_data(llama_io_write_i & io) override; + size_t state_read_data (llama_io_read_i & io) override; - size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) override; + size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id) override; private: // @@ -767,11 +733,11 @@ class llama_context_recurrent : public llama_context_base { // state save/load // - size_t state_get_data(llama_io_write_i & io) override; - size_t state_set_data(llama_io_read_i & io) override; + size_t state_write_data(llama_io_write_i & io) override; + size_t state_read_data (llama_io_read_i & io) override; - size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override; - size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override; + size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) override; + size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id) override; private: // @@ -782,21 +748,206 @@ class llama_context_recurrent : public llama_context_base { llama_kv_cache_recurrent kv_self; }; +// TODO: tmp - need something better +struct llama_cross { + int32_t n_outputs; + float * embd_enc; + + std::vector> seq_ids_enc; +}; + class llama_context_enc : public llama_context_base { public: using llama_context_base::llama_context_base; + + int encode(llama_batch & inp_batch) override; + + llama_cross * cross = nullptr; }; -class llama_context_enc_dec : public llama_context_enc { +class llama_context_dec : public llama_context_kv_self { +public: + using llama_context_kv_self::llama_context_kv_self; + +protected: + void reserve() override; + + // + // input + // + + void input_set(const llama_ubatch & ubatch) override; + +private: + struct { + ggml_tensor * cross_embd; // F32 [n_embd, 
n_outputs_enc] + ggml_tensor * cross_kq_mask; // F32 [n_outputs_enc, n_batch] + ggml_tensor * cross_kq_mask_cnv; // F32 [n_outputs_enc, n_batch] + } inp; + +protected: + // + // graph + // + + ggml_cgraph * graph_init() override; + + ggml_tensor * build_inp_cross_embd( + ggml_context * ctx0) override; + + void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) override; + + ggml_tensor * build_attn_cross( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il) override; + +public: + llama_cross * cross = nullptr; +}; + +class llama_context_enc_dec : public llama_context_i { public: llama_context_enc_dec( const llama_model & model, llama_context_params params); - virtual ~llama_context_enc_dec(); + ~llama_context_enc_dec(); + + void init() override; + + void synchronize() override; + + const llama_model & get_model() const override; + + // TODO: the default implementation of these getters calls the corresponding getter of the enc or dec context + // in the future, the public API in llama.h should allow to get references to the context that the user wants + // this will allow to specify the desired context explicitly + // for example: + // + // // this can be an enc-dec context + // llama_context_t ctx = llama_init_from_model(...); + // + // ... + // + // llama_context_t ctx_enc = llama_get_ctx_enc(ctx); + // llama_set_embeddings(ctx_enc, true); + // + // llama_context_t ctx_dec = llama_get_ctx_dec(ctx); + // llama_set_causal_attn(ctx_dec, true); + // + uint32_t n_ctx() const override; + uint32_t n_ctx_per_seq() const override; + uint32_t n_batch() const override; + uint32_t n_ubatch() const override; + uint32_t n_seq_max() const override; + + uint32_t n_threads() const override; + uint32_t n_threads_batch() const override; + + llama_kv_cache * get_kv_self() override; + const llama_kv_cache * get_kv_self() const override; + + void kv_self_update() override; + + enum llama_pooling_type pooling_type() const override; + + float * get_logits() override; + float * get_logits_ith(int32_t i) override; + + float * get_embeddings() override; + float * get_embeddings_ith(int32_t i) override; + float * get_embeddings_seq(llama_seq_id seq_id) override; + + void attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) override; + + void detach_threadpool() override; + + void set_n_threads(int32_t n_threads, int32_t n_threads_batch) override; + + void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) override; + + void set_embeddings (bool value) override; + void set_causal_attn(bool value) override; + + void set_adapter_lora( + llama_adapter_lora * adapter, + float scale) override; + + bool rm_adapter_lora( + llama_adapter_lora * adapter) override; + + void clear_adapter_lora() override; + + bool apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) override; + + int encode(llama_batch & inp_batch) override; + int decode(llama_batch & inp_batch) override; + + // + // perf + // + + llama_perf_context_data perf_get_data() const override; + void perf_reset() override; + + // + // state save/load + // + + size_t state_get_size() override; + size_t state_get_data( uint8_t * dst, size_t size) override; + size_t state_set_data(const uint8_t * src, size_t size) override; + + size_t state_seq_get_size(llama_seq_id seq_id) override; + size_t 
state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) override; + size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override; + + bool state_load_file( + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) override; + + bool state_save_file( + const char * filepath, + const llama_token * tokens, + size_t n_token_count) override; + + size_t state_seq_load_file( + llama_seq_id seq_id, + const char * filepath, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out) override; + + size_t state_seq_save_file( + llama_seq_id seq_id, + const char * filepath, + const llama_token * tokens, + size_t n_token_count) override; private: - llama_context_kv_self ctx_dec; + std::unique_ptr ctx_enc; + std::unique_ptr ctx_dec; + + llama_cross cross; }; // For internal test use diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 99eb326205bc6..1e336e844ada0 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -26,7 +26,29 @@ ggml_tensor * llama_graph_i::build_attn( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_embd_enc( +ggml_tensor * llama_graph_i::build_attn_cross( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il) { + GGML_UNUSED(ctx0); + GGML_UNUSED(gf); + GGML_UNUSED(q_cur); + GGML_UNUSED(k_cur); + GGML_UNUSED(v_cur); + GGML_UNUSED(kq_b); + GGML_UNUSED(kq_scale); + GGML_UNUSED(il); + + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_i::build_inp_cross_embd( ggml_context * ctx0) { GGML_UNUSED(ctx0); @@ -34,7 +56,7 @@ ggml_tensor * llama_graph_i::build_inp_embd_enc( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_kq_mask_cross( +ggml_tensor * llama_graph_i::build_inp_cross_kq_mask( ggml_context * ctx0, int32_t n_tokens) { GGML_UNUSED(ctx0); diff --git a/src/llama-graph.h b/src/llama-graph.h index c84c254934ff1..28e8a563067db 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -114,10 +114,20 @@ class llama_graph_i { float kq_scale, int il); - virtual ggml_tensor * build_inp_embd_enc( + virtual ggml_tensor * build_attn_cross( + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il); + + virtual ggml_tensor * build_inp_cross_embd( ggml_context * ctx0); - virtual ggml_tensor * build_inp_kq_mask_cross( + virtual ggml_tensor * build_inp_cross_kq_mask( ggml_context * ctx0, int32_t n_tokens); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e8057f4687fdf..38e8c2812fcbb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3964,16 +3964,16 @@ struct llm_build_context { } // TODO: tmp - struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0); + struct ggml_tensor * build_inp_cross_embd() { + ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0); cb(cur, "embd_enc", -1); return cur; } // TODO: tmp - struct ggml_tensor * build_inp_kq_mask_cross() { - ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens); + struct ggml_tensor * build_inp_cross_kq_mask() { + ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens); cb(cur, "KQ_mask_cross", -1); return cur; @@ -4294,6 +4294,42 @@ struct llm_build_context { return cur; } + struct ggml_tensor * build_attn_cross( + struct ggml_cgraph * gf, + struct ggml_tensor 
* wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int32_t n_tokens, // TODO: remove + float kq_scale, + int il) { + GGML_UNUSED(n_tokens); + + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * cur = lgf->build_attn_cross(ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); + cb(cur, "kqv_out", il); + + if (wo) { + cur = lgf->build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; + } + struct ggml_tensor * build_attn_with_kq_b( struct ggml_cgraph * gf, struct ggml_tensor * wo, @@ -9762,209 +9798,173 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); } - //void build_t5_dec(ggml_cgraph * gf) { - // const int64_t n_embd_head = hparams.n_embd_head_v; - // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - - // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - - // inpL = build_inp_embd(model.tok_embd); - - // GGML_ASSERT(!lctx.is_encoding); - // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - - // struct ggml_tensor * embd_enc = build_inp_embd_enc(); - // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - - // struct ggml_tensor * KQ_mask_dec = build_inp_kq_mask(); - // struct ggml_tensor * KQ_mask_cross = build_inp_kq_mask_cross(); - - // for (int il = 0; il < n_layer; ++il) { - // struct ggml_tensor * inpSA = inpL; - - // // norm - // cur = build_norm(inpL, - // model.layers[il].attn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm", il); - - // // self-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - // cb(Qcur, "Qcur", il); - - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - // cb(Kcur, "Kcur", il); + void build_t5_dec(ggml_cgraph * gf) { + const int64_t n_embd_head = hparams.n_embd_head_v; + //const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - // cb(Vcur, "Vcur", il); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - // build_kv_store(gf, Kcur, Vcur, il); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * k = - // ggml_view_3d(ctx0, kv_self.k_l[il], - // n_embd_head_k, n_kv, n_head_kv, - // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - // 0); - // cb(k, "k", il); + inpL = build_inp_embd(model.tok_embd); - // struct ggml_tensor * v = - // ggml_view_3d(ctx0, kv_self.v_l[il], - // n_kv, n_embd_head_v, n_head_kv, - // ggml_element_size(kv_self.v_l[il])*n_ctx, - // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - // 0); - // cb(v, "v", il); + struct ggml_tensor * embd_enc = build_inp_cross_embd(); + struct ggml_tensor * pos_bucket_dec = build_pos_bucket(); - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + const int64_t n_outputs_enc = embd_enc->ne[1]; - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + lgf->build_attn_inp(ctx0, n_tokens, true, false); - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); + for (int il = 0; il < n_layer; ++il) { + struct 
ggml_tensor * inpSA = inpL; - // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - // cb(kq_b, "kq_b", il); + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); - // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); + // self-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - // cb(kqv, "kqv", il); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - // ggml_build_forward_expand(gf, cur); + struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b); - // cur = build_lora_mm(model.layers[il].wo, cur); - // cb(cur, "kqv_out", il); - // } + cur = build_attn_with_kq_b(gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); + cb(cur, "kqv_out", il); + } - // cur = ggml_add(ctx0, cur, inpSA); - // cb(cur, "cross_inp", il); + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "cross_inp", il); - // struct ggml_tensor * inpCA = cur; + struct ggml_tensor * inpCA = cur; - // // norm - // cur = build_norm(cur, - // model.layers[il].attn_norm_cross, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "attn_norm_cross", il); + // norm + cur = build_norm(cur, + model.layers[il].attn_norm_cross, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm_cross", il); - // // cross-attention - // { - // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); - // cb(Qcur, "Qcur", il); + // cross-attention + { + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + cb(Qcur, "Qcur", il); - // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); - // cb(Kcur, "Kcur", il); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + cb(Kcur, "Kcur", il); - // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); - // cb(Vcur, "Vcur", il); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + cb(Vcur, "Vcur", il); - // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc); - // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - // struct ggml_tensor * k = ggml_cont(ctx0, 
ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + cur = build_attn_cross(gf, + model.layers[il].wo_cross, nullptr, + Qcur, Kcur, Vcur, n_tokens, 1.0f, il); + cb(cur, "kqv_out", il); - // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // cb(kq, "kq", il); + //struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + //struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - // cb(kq, "kq_soft_max_ext", il); + //struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); - // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - // cb(v, "v", il); + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); - // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - // cb(kqv, "kqv", il); + //struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + //cb(v, "v", il); - // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - // cb(kqv_merged, "kqv_merged", il); + //struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + //cb(kqv, "kqv", il); - // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - // cb(cur, "kqv_merged_cont", il); + //struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); - // ggml_build_forward_expand(gf, cur); + //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + //cb(cur, "kqv_merged_cont", il); - // cur = build_lora_mm(model.layers[il].wo_cross, cur); - // cb(cur, "kqv_out", il); - // } + //ggml_build_forward_expand(gf, cur); - // if (il == n_layer - 1) { - // // skip computing output for unused tokens - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - // } + //cur = build_lora_mm(model.layers[il].wo_cross, cur); + //cb(cur, "kqv_out", il); + } - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - // cb(ffn_inp, "ffn_inp", il); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + } - // // feed-forward network - // { - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + cb(ffn_inp, "ffn_inp", il); - // // T5 uses relu, flan-T5 uses gelu-gated - // cur = build_ffn(cur, - // model.layers[il].ffn_up, NULL, NULL, - // model.layers[il].ffn_gate, NULL, NULL, - // model.layers[il].ffn_down, NULL, NULL, - // NULL, - // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, - // il); - // cb(cur, "ffn_out", il); - // } + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - // cur = ggml_add(ctx0, cur, ffn_inp); - // cb(cur, "ffn_out", il); + // T5 uses relu, flan-T5 uses gelu-gated + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + il); + cb(cur, "ffn_out", il); + } - // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - // if (layer_dir != nullptr) { - // cur = ggml_add(ctx0, cur, layer_dir); - // } - // cb(cur, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); - // // input for next layer - // inpL = cur; - // } + cur = lgf->build_cvec(ctx0, cur, il); + cb(cur, "l_out", il); - // cur = inpL; - // cb(cur, "result_embd", -1); + // input for next layer + inpL = cur; + } - // cur = build_norm(cur, - // model.output_norm, NULL, - // LLM_NORM_RMS, -1); + cur = inpL; + cb(cur, "result_embd", -1); - // cb(cur, "result_norm", -1); - // res.t_embd = cur; + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); - // // lm_head - // cur = build_lora_mm(model.output, cur); + cb(cur, "result_norm", -1); + res.t_embd = cur; - // cb(cur, "result_output", -1); - // res.t_logits = cur; + // lm_head + cur = build_lora_mm(model.output, cur); - // ggml_build_forward_expand(gf, cur); + cb(cur, "result_output", -1); + res.t_logits = cur; - // return gf; - //} + ggml_build_forward_expand(gf, cur); + } void build_jais(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -11119,7 +11119,7 @@ llama_graph_result llama_model::build_graph( llm.build_t5_enc(gf); break; case LLAMA_GRAPH_TYPE_DECODER: - //llm.build_t5_dec(gf); + llm.build_t5_dec(gf); break; default: GGML_ABORT("invalid graph type"); From e5bc5f8e029b668078f76eb779eac52b183ff660 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 25 Feb 2025 12:10:34 +0200 Subject: [PATCH 75/84] context : enc-dec is now working ggml-ci --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 38e8c2812fcbb..8e579d8e88fa1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9741,7 +9741,7 @@ struct llm_build_context { struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b); cur = build_attn_with_kq_b(gf, - model.layers[il].wo, model.layers[il].bo, + model.layers[il].wo_enc, nullptr, Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); cb(cur, "kqv_out", il); } From e2b3294f2c13c468ca9f798525344b063dafa378 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 25 Feb 2025 12:14:34 +0200 Subject: [PATCH 76/84] context : fix enc-dec state save/load ggml-ci --- src/llama-context.cpp | 48 ++++++++++--------------------------------- 1 file changed, 11 insertions(+), 37 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 81663c40018e3..dacf809086cb1 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -4981,41 +4981,31 @@ void llama_context_enc_dec::perf_reset() { // // state save/load +// TODO: for now dump just the decoder state, in the future dump both // size_t llama_context_enc_dec::state_get_size() { - GGML_ABORT("TODO: implement"); + return ctx_dec->state_get_size(); } size_t 
llama_context_enc_dec::state_get_data( uint8_t * dst, size_t size) { - GGML_UNUSED(dst); - GGML_UNUSED(size); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_get_data(dst, size); } size_t llama_context_enc_dec::state_set_data(const uint8_t * src, size_t size) { - GGML_UNUSED(src); - GGML_UNUSED(size); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_set_data(src, size); } size_t llama_context_enc_dec::state_seq_get_size(llama_seq_id seq_id) { - GGML_UNUSED(seq_id); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_get_size(seq_id); } size_t llama_context_enc_dec::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) { - GGML_UNUSED(seq_id); - GGML_UNUSED(dst); - GGML_UNUSED(size); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_get_data(seq_id, dst, size); } size_t llama_context_enc_dec::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) { - GGML_UNUSED(seq_id); - GGML_UNUSED(src); - GGML_UNUSED(size); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_set_data(seq_id, src, size); } bool llama_context_enc_dec::state_load_file( @@ -5023,21 +5013,14 @@ bool llama_context_enc_dec::state_load_file( llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - GGML_UNUSED(filepath); - GGML_UNUSED(tokens_out); - GGML_UNUSED(n_token_capacity); - GGML_UNUSED(n_token_count_out); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_load_file(filepath, tokens_out, n_token_capacity, n_token_count_out); } bool llama_context_enc_dec::state_save_file( const char * filepath, const llama_token * tokens, size_t n_token_count) { - GGML_UNUSED(filepath); - GGML_UNUSED(tokens); - GGML_UNUSED(n_token_count); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_save_file(filepath, tokens, n_token_count); } size_t llama_context_enc_dec::state_seq_load_file( @@ -5046,12 +5029,7 @@ size_t llama_context_enc_dec::state_seq_load_file( llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - GGML_UNUSED(seq_id); - GGML_UNUSED(filepath); - GGML_UNUSED(tokens_out); - GGML_UNUSED(n_token_capacity); - GGML_UNUSED(n_token_count_out); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_load_file(seq_id, filepath, tokens_out, n_token_capacity, n_token_count_out); } size_t llama_context_enc_dec::state_seq_save_file( @@ -5059,11 +5037,7 @@ size_t llama_context_enc_dec::state_seq_save_file( const char * filepath, const llama_token * tokens, size_t n_token_count) { - GGML_UNUSED(seq_id); - GGML_UNUSED(filepath); - GGML_UNUSED(tokens); - GGML_UNUSED(n_token_count); - GGML_ABORT("TODO: implement"); + return ctx_dec->state_seq_save_file(seq_id, filepath, tokens, n_token_count); } // From 4efe9898862ccea908176a6801c643382f2e27f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 25 Feb 2025 16:11:17 +0200 Subject: [PATCH 77/84] context : pass embeddings tensor from encoder to decoder ggml-ci --- src/llama-context.cpp | 45 ++++++++++++++++++++++++------------------- src/llama-context.h | 7 ++++--- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index dacf809086cb1..f7c83e886ef1c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -4540,6 +4540,7 @@ size_t llama_context_recurrent::state_seq_read_data(llama_io_read_i & io, llama_ // llama_context_enc // +// TODO: avoid copy-paste of the entire encode() function int llama_context_enc::encode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { 
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); @@ -4671,8 +4672,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // overlap with device computation. ggml_backend_sched_reset(sched.get()); - cross->n_outputs = n_tokens; - cross->embd_enc = embd; + cross->t_embd = t_embd; // remember the sequence ids used during the encoding - needed for cross attention later cross->seq_ids_enc.resize(n_tokens); @@ -4692,9 +4692,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { void llama_context_dec::reserve() { // simulate full KV cache - cross->n_outputs = cparams.n_ubatch; - - LLAMA_LOG_DEBUG("%s: n_outputs = %u\n", __func__, cross->n_outputs); + cross->t_embd = nullptr; llama_context_kv_self::reserve(); } @@ -4703,15 +4701,15 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context_kv_self::input_set(ubatch); - if (inp.cross_embd) { - assert(inp.cross_embd->type == GGML_TYPE_F32); - assert(ggml_nelements(inp.cross_embd) == cross->n_outputs*model.hparams.n_embd); + //if (inp.cross_embd && inp.cross_embd->op != GGML_OP_NONE) { + // assert(inp.cross_embd->type == GGML_TYPE_F32); + // assert(ggml_nelements(inp.cross_embd) == cross->n_outputs*model.hparams.n_embd); - ggml_backend_tensor_set(inp.cross_embd, cross->embd_enc, 0, ggml_nbytes(inp.cross_embd)); - } + // ggml_backend_tensor_set(inp.cross_embd, cross->embd_enc, 0, ggml_nbytes(inp.cross_embd)); + //} if (inp.cross_kq_mask) { - const int64_t n_output_enc = cross->n_outputs; + const int64_t n_enc = inp.cross_kq_mask->ne[0]; const int64_t n_tokens = ubatch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(inp.cross_kq_mask->buffer)); @@ -4721,7 +4719,7 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_output_enc; ++i) { + for (int i = 0; i < n_enc; ++i) { float f = -INFINITY; for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch.seq_id[j][s]; @@ -4729,13 +4727,13 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { f = 0.0f; } } - data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f; + data[h*(n_enc*n_tokens) + j*n_enc + i] = f; } } for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_output_enc; ++j) { - data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY; + for (int j = 0; j < n_enc; ++j) { + data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY; } } } @@ -4750,12 +4748,19 @@ ggml_cgraph * llama_context_dec::graph_init() { ggml_tensor * llama_context_dec::build_inp_cross_embd( ggml_context * ctx0) { + // if we have the output embeddings from the encoder, use them directly + if (cross->t_embd) { + inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + + return inp.cross_embd; + } + const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; - const int32_t n_outputs_enc = cross->n_outputs; + const auto n_embd = hparams.n_embd; + const auto n_enc = hparams.n_ctx_train; - inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); + inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); ggml_set_input(inp.cross_embd); return inp.cross_embd; @@ -4768,9 +4773,9 @@ void llama_context_dec::build_attn_inp( bool swa) { llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); - const int32_t n_outputs_enc = cross->n_outputs; + const int32_t n_enc = cross->t_embd ? 
cross->t_embd->ne[1] : model.hparams.n_ctx_train; - inp.cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp.cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); ggml_set_input(inp.cross_kq_mask); inp.cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.cross_kq_mask, GGML_TYPE_F16) : inp.cross_kq_mask; diff --git a/src/llama-context.h b/src/llama-context.h index 3165865a73c37..af35b577b3af1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -748,11 +748,12 @@ class llama_context_recurrent : public llama_context_base { llama_kv_cache_recurrent kv_self; }; -// TODO: tmp - need something better +// TODO: tmp - need something better to pass the data from the encoder to the decoder struct llama_cross { - int32_t n_outputs; - float * embd_enc; + // the output embeddings from the encoder + ggml_tensor * t_embd = nullptr; + // needed to construct the cross-attention mask in the decoder std::vector> seq_ids_enc; }; From 952feedfca81134c686781ec210a6a15e5bd2b6c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 27 Feb 2025 15:07:10 +0200 Subject: [PATCH 78/84] context : disable encoder embd tensor for now ggml-ci --- src/llama-context.cpp | 23 ++++++++++++----------- src/llama-context.h | 7 ++++++- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f7c83e886ef1c..4341c571e3b2d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -4673,6 +4673,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); cross->t_embd = t_embd; + cross->v_embd = embd; // remember the sequence ids used during the encoding - needed for cross attention later cross->seq_ids_enc.resize(n_tokens); @@ -4701,12 +4702,11 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context_kv_self::input_set(ubatch); - //if (inp.cross_embd && inp.cross_embd->op != GGML_OP_NONE) { - // assert(inp.cross_embd->type == GGML_TYPE_F32); - // assert(ggml_nelements(inp.cross_embd) == cross->n_outputs*model.hparams.n_embd); + if (inp.cross_embd && cross->t_embd) { + assert(inp.cross_embd->type == GGML_TYPE_F32); - // ggml_backend_tensor_set(inp.cross_embd, cross->embd_enc, 0, ggml_nbytes(inp.cross_embd)); - //} + ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd)); + } if (inp.cross_kq_mask) { const int64_t n_enc = inp.cross_kq_mask->ne[0]; @@ -4749,16 +4749,17 @@ ggml_cgraph * llama_context_dec::graph_init() { ggml_tensor * llama_context_dec::build_inp_cross_embd( ggml_context * ctx0) { // if we have the output embeddings from the encoder, use them directly - if (cross->t_embd) { - inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + // TODO: needs more work to be correct, for now just use the tensor shape + //if (cross->t_embd) { + // inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd); - return inp.cross_embd; - } + // return inp.cross_embd; + //} const auto & hparams = model.hparams; - const auto n_embd = hparams.n_embd; - const auto n_enc = hparams.n_ctx_train; + const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; + const auto n_enc = cross->t_embd ? 
cross->t_embd->ne[1] : hparams.n_ctx_train; inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); ggml_set_input(inp.cross_embd); diff --git a/src/llama-context.h b/src/llama-context.h index af35b577b3af1..1b807ccf84a5c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -750,9 +750,14 @@ class llama_context_recurrent : public llama_context_base { // TODO: tmp - need something better to pass the data from the encoder to the decoder struct llama_cross { - // the output embeddings from the encoder + // the output embeddings from the encoder as a ggml tensor + // TODO: this needs more work to be correct, for now copy the embeddings data to host memory + // ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524 ggml_tensor * t_embd = nullptr; + // embeddings data copied to host memory (tmp) + float * v_embd = nullptr; + // needed to construct the cross-attention mask in the decoder std::vector> seq_ids_enc; }; From 828effd9d74d770e03852b6123d54f12e92bb950 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 27 Feb 2025 15:54:44 +0200 Subject: [PATCH 79/84] kv-cache : basic abstraction ggml-ci --- src/llama-context.cpp | 288 +++++++++++++++++++++-------------------- src/llama-context.h | 4 +- src/llama-kv-cache.cpp | 84 +++++++----- src/llama-kv-cache.h | 66 +++++++--- 4 files changed, 244 insertions(+), 198 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4341c571e3b2d..5c77b29c13a7d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2384,15 +2384,16 @@ llama_context_kv_self::llama_context_kv_self( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context_base(model, params, gtype), - kv_self(model.hparams) { + llama_context_base(model, params, gtype) { LLAMA_LOG_INFO("%s: constructing llama_context_kv_self\n", __func__); const auto & hparams = model.hparams; + kv_self = std::make_unique(hparams); + LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self.get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, kv_self->get_padding(cparams)); LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); @@ -2406,14 +2407,14 @@ llama_context_kv_self::llama_context_kv_self( GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); if (!hparams.vocab_only) { - if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); throw std::runtime_error("failed to initialize self-attention cache"); } { - const size_t memory_size_k = kv_self.size_k_bytes(); - const size_t memory_size_v = kv_self.size_v_bytes(); + const size_t memory_size_k = kv_self->size_k_bytes(); + const size_t memory_size_v = kv_self->size_v_bytes(); LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), @@ -2427,19 +2428,19 @@ llama_context_kv_self::~llama_context_kv_self() = default; void llama_context_kv_self::reserve() { // simulate full KV cache - kv_self.n = kv_self.size; + kv_self->n = kv_self->size; - LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); + LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self->n); llama_context_base::reserve(); } llama_kv_cache * llama_context_kv_self::get_kv_self() { - 
return &kv_self; + return kv_self.get(); } const llama_kv_cache * llama_context_kv_self::get_kv_self() const { - return &kv_self; + return kv_self.get(); } void llama_context_kv_self::kv_self_update() { @@ -2449,8 +2450,8 @@ void llama_context_kv_self::kv_self_update() { bool need_reserve = false; - if (kv.has_shift) { - if (!kv.can_shift) { + if (kv->has_shift) { + if (!kv->get_can_shift()) { GGML_ABORT("The current context does not support K-shift"); } @@ -2474,16 +2475,16 @@ void llama_context_kv_self::kv_self_update() { } { - kv.has_shift = false; + kv->has_shift = false; - for (uint32_t i = 0; i < kv.size; ++i) { - kv.cells[i].delta = 0; + for (uint32_t i = 0; i < kv->size; ++i) { + kv->cells[i].delta = 0; } } } // defragment the KV cache if needed - if (kv.do_defrag) { + if (kv->do_defrag) { LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); ggml_backend_sched_reset(sched.get()); @@ -2499,7 +2500,7 @@ void llama_context_kv_self::kv_self_update() { graph_compute(gf, false); - kv.do_defrag = false; + kv->do_defrag = false; need_reserve = true; } @@ -2513,7 +2514,7 @@ void llama_context_kv_self::kv_self_update() { uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); // simulate full KV cache - kv_self.n = kv_self.size; + kv_self->n = kv_self->size; llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; @@ -2537,7 +2538,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const int32_t n_tokens = batch.n_tokens; @@ -2674,7 +2675,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self->pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -2689,7 +2690,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // TODO: remove this stuff class batch_guard { public: - batch_guard(llama_kv_cache & kv_self) : kv_slot_restorer(kv_self) { + batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) { } ~batch_guard() { @@ -2712,7 +2713,7 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_kv_slot_restorer kv_slot_restorer; }; - batch_guard bg(kv_self); + batch_guard bg(*kv_self); GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT @@ -2797,11 +2798,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { - kv_self.head = 0; + if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) { + kv_self->head = 0; } - const auto slot_info = kv_self.find_slot(ubatch); + const auto slot_info = kv_self->find_slot(ubatch); if (!slot_info) { LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); return -3; @@ -2813,12 +2814,12 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); + const uint32_t pad = kv_self->get_padding(cparams); + kv_self->n = std::min(kv_self->size, std::max(pad, GGML_PAD(kv_self->cell_max(), pad))); } } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head); ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); @@ -2847,11 +2848,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // update the kv ring buffer { - kv_self.head += ubatch.n_tokens; + kv_self->head += ubatch.n_tokens; // Ensure kv cache head points to a valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; + if (kv_self->head >= kv_self->size) { + kv_self->head = 0; } } @@ -2972,13 +2973,13 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { // - do not defrag small contexts (i.e. < 2048 tokens) // - count the padding towards the number of used tokens - const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + kv_self.get_padding(cparams))/float(kv_self.n)) : 0.0f; + const float fragmentation = kv_self->n >= 2048 ? 
std::max(0.0f, 1.0f - float(kv_self->used + kv_self->get_padding(cparams))/float(kv_self->n)) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - kv_self.defrag(); + kv_self->defrag(); } } @@ -2997,8 +2998,8 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { int32_t * data = (int32_t *) inp.self_k_shift->data; - for (uint32_t i = 0; i < kv_self.size; ++i) { - data[i] = kv_self.cells[i].delta; + for (uint32_t i = 0; i < kv_self->size; ++i) { + data[i] = kv_self->cells[i].delta; } // the K-shift graph requires just this input @@ -3011,7 +3012,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { if (inp.self_kq_mask || inp.self_kq_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. if (cparams.causal_attn) { - const int64_t n_kv = kv_self.n; + const int64_t n_kv = kv_self->n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; @@ -3041,11 +3042,11 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { for (int i = 0; i < n_kv; ++i) { float f; - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + if (!kv_self->cells[i].has_seq_id(seq_id) || kv_self->cells[i].pos > pos) { f = -INFINITY; } else { if (hparams.use_alibi) { - f = -std::abs(kv_self.cells[i].pos - pos); + f = -std::abs(kv_self->cells[i].pos - pos); } else { f = 0.0f; } @@ -3057,7 +3058,7 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // may need to cut off old tokens for sliding window if (data_swa) { - if (pos - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) { + if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) { f = -INFINITY; } data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; @@ -3137,11 +3138,11 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { int32_t * data = (int32_t *) inp.self_pos_bucket->data; - const int64_t n_kv = kv_self.n; + const int64_t n_kv = kv_self->n; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); } } } @@ -3164,7 +3165,7 @@ ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) { - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; inp.self_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); ggml_set_input(inp.self_pos_bucket); @@ -3177,7 +3178,7 @@ void llama_context_kv_self::build_attn_inp( int32_t n_tokens, bool causal, bool swa) { - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; inp.self_kq_mask = causal ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) @@ -3224,13 +3225,13 @@ ggml_tensor * llama_context_kv_self::build_attn( // store to KV cache { - GGML_ASSERT(!kv_self.recurrent); + GGML_ASSERT(!kv_self->recurrent); - const auto kv_head = kv_self.head; + const auto kv_head = kv_self->head; - GGML_ASSERT(kv_self.size == n_ctx); + GGML_ASSERT(kv_self->size == n_ctx); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head); //cb(k_cache_view, "k_cache_view", il); // note: storing RoPE-ed version of K in the KV cache @@ -3241,12 +3242,12 @@ ggml_tensor * llama_context_kv_self::build_attn( struct ggml_tensor * v_cache_view = nullptr; if (!v_trans) { - v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head); } else { // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv_self.v_l[il]), - (kv_head)*ggml_element_size(kv_self.v_l[il])); + v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self->v_l[il]), + (kv_head)*ggml_element_size(kv_self->v_l[il])); v_cur = ggml_transpose(ctx0, v_cur); } @@ -3281,7 +3282,7 @@ ggml_tensor * llama_context_kv_self::build_attn( const auto & kq_mask = is_sliding ? inp.self_kq_mask_swa_cnv : inp.self_kq_mask_cnv; - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; const int64_t n_head_kv = hparams.n_head_kv(il); @@ -3292,23 +3293,23 @@ ggml_tensor * llama_context_kv_self::build_attn( //cb(q, "q", il); ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], + ggml_view_3d(ctx0, kv_self->k_l[il], n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k), 0); //cb(k, "k", il); ggml_tensor * v = !v_trans ? 
- ggml_view_3d(ctx0, kv_self.v_l[il], + ggml_view_3d(ctx0, kv_self->v_l[il], n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v), 0) : - ggml_view_3d(ctx0, kv_self.v_l[il], + ggml_view_3d(ctx0, kv_self->v_l[il], n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + ggml_element_size(kv_self->v_l[il])*n_ctx, + ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v, 0); struct ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale); @@ -3326,7 +3327,7 @@ void llama_context_kv_self::build_kv_self_shift( const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; - //GGML_ASSERT(kv_self.size == n_ctx); + //GGML_ASSERT(kv_self->size == n_ctx); ggml_tensor * inp_self_k_shift = build_inp_self_k_shift(ctx0); @@ -3337,13 +3338,13 @@ void llama_context_kv_self::build_kv_self_shift( struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, kv_self.size, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_view_3d(ctx0, kv_self->k_l[il], + n_embd_head_k, n_head_kv, kv_self->size, + ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self.k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self->k_l[il]->buffer); ggml_build_forward_expand(gf, cur); } @@ -3356,8 +3357,8 @@ void llama_context_kv_self::build_kv_self_defrag( const uint32_t n_layer = hparams.n_layer; - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; + const uint32_t n_kv = kv_self->cell_max(); + const uint32_t n_used = kv_self->used; assert(n_used <= n_kv); @@ -3382,7 +3383,7 @@ void llama_context_kv_self::build_kv_self_defrag( std::vector ids(n_kv, n_kv); for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; + const auto & cell0 = kv_self->cells[i0]; if (!cell0.is_empty()) { ids[i0] = i0; @@ -3395,7 +3396,7 @@ void llama_context_kv_self::build_kv_self_defrag( uint32_t nh = 1; // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + while (i0 + nh < n_used && kv_self->cells[i0 + nh].is_empty()) { nh++; } @@ -3404,7 +3405,7 @@ void llama_context_kv_self::build_kv_self_defrag( // starting from the end, find nh non-empty cells for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; + const auto & cell1 = kv_self->cells[is]; if (cell1.is_empty() || ids[is] != n_kv) { continue; @@ -3433,7 +3434,7 @@ void llama_context_kv_self::build_kv_self_defrag( // go back and move the nf cells to the hole for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; + auto & cell1 = kv_self->cells[i1]; if (cell1.is_empty() || ids[i1] != n_kv) { if (n_moves == max_moves) { @@ -3449,11 +3450,11 @@ void llama_context_kv_self::build_kv_self_defrag( ids[i1] = i0 + nf; // move the cell meta data - kv_self.cells[i0 + nf] = cell1; + kv_self->cells[i0 + nf] = cell1; // clear the old cell and move the head there cell1 = llama_kv_cell(); - kv_self.head = 
n_used; + kv_self->head = n_used; if (!cont) { n_moves++; @@ -3572,40 +3573,40 @@ void llama_context_kv_self::build_kv_self_defrag( const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self->k_l[il], n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i)); - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self->k_l[il], n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id)); ggml_tensor * view_v_src; ggml_tensor * view_v_dst; if (cparams.flash_attn) { // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i)); - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id)); } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il], nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); + ggml_row_size(kv_self->v_l[il]->type, kv_self->size), + ggml_row_size(kv_self->v_l[il]->type, i)); - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + view_v_dst = ggml_view_2d(ctx0, kv_self->v_l[il], nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); + ggml_row_size(kv_self->v_l[il]->type, kv_self->size), + ggml_row_size(kv_self->v_l[il]->type, id)); } ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); @@ -3625,7 +3626,7 @@ size_t llama_context_kv_self::state_write_data(llama_io_write_i & io) { llama_context_base::state_write_data(io); LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); - kv_self.state_write(io); + kv_self->state_write(io); return io.n_bytes(); } @@ -3634,7 +3635,7 @@ size_t llama_context_kv_self::state_read_data(llama_io_read_i & io) { llama_context_base::state_read_data(io); LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); - kv_self.state_read(io); + kv_self->state_read(io); return io.n_bytes(); } @@ -3642,7 +3643,7 @@ size_t llama_context_kv_self::state_read_data(llama_io_read_i & io) { size_t llama_context_kv_self::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context_base::state_seq_write_data(io, seq_id); - kv_self.state_write(io, seq_id); + kv_self->state_write(io, seq_id); return io.n_bytes(); } @@ -3650,7 +3651,7 @@ size_t llama_context_kv_self::state_seq_write_data(llama_io_write_i & io, llama_ size_t 
llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context_base::state_seq_read_data(io, seq_id); - kv_self.state_read(io, seq_id); + kv_self->state_read(io, seq_id); return io.n_bytes(); } @@ -3663,12 +3664,13 @@ llama_context_recurrent::llama_context_recurrent( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context_base(model, params, gtype), - kv_self(model.hparams) { + llama_context_base(model, params, gtype) { LLAMA_LOG_INFO("%s: constructing llama_context_recurrent\n", __func__); const auto & hparams = model.hparams; + kv_self = std::make_unique(hparams); + LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); // Mamba only needs a constant number of KV cache cells per sequence @@ -3684,14 +3686,14 @@ llama_context_recurrent::llama_context_recurrent( GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); if (!hparams.vocab_only) { - if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); throw std::runtime_error("failed to initialize self-attention cache"); } { - const size_t memory_size_k = kv_self.size_k_bytes(); - const size_t memory_size_v = kv_self.size_v_bytes(); + const size_t memory_size_k = kv_self->size_k_bytes(); + const size_t memory_size_v = kv_self->size_v_bytes(); LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), @@ -3705,20 +3707,20 @@ llama_context_recurrent::~llama_context_recurrent() = default; void llama_context_recurrent::reserve() { // simulate full KV cache - kv_self.n = kv_self.size; + kv_self->n = kv_self->size; - LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n); + LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self->n); // TODO: implement recurrent-specific reserve logic llama_context_base::reserve(); } llama_kv_cache * llama_context_recurrent::get_kv_self() { - return &kv_self; + return kv_self.get(); } const llama_kv_cache * llama_context_recurrent::get_kv_self() const { - return &kv_self; + return kv_self.get(); } void llama_context_recurrent::kv_self_update() { @@ -3740,7 +3742,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // temporary allocate memory for the input batch if needed // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self->pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -3755,7 +3757,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // TODO: remove this stuff class batch_guard { public: - batch_guard(llama_kv_cache & kv_self) : kv_slot_restorer(kv_self) { + batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) { } ~batch_guard() { @@ -3778,7 +3780,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { llama_kv_slot_restorer kv_slot_restorer; }; - batch_guard bg(kv_self); + batch_guard bg(*kv_self); GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT @@ -3870,11 +3872,11 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { - kv_self.head = 0; + if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) { + kv_self->head = 0; } - const auto slot_info = kv_self.find_slot(ubatch); + const auto slot_info = kv_self->find_slot(ubatch); if (!slot_info) { LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); return -3; @@ -3883,7 +3885,7 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { bg.save(slot_info); } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head); ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); @@ -3912,11 +3914,11 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // update the kv ring buffer { - kv_self.head += ubatch.n_tokens; + kv_self->head += ubatch.n_tokens; // Ensure kv cache head points to a valid index. 
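/*
 * Illustrative sketch (not from the original patch): the surrounding hunk is the ring-buffer
 * head update that runs after each decoded ubatch. Assuming only the `head` and `size`
 * members that llama_kv_cache_unified exposes, the invariant being maintained is:
 *
 *     // hypothetical stand-in type, for illustration only
 *     struct kv_ring { uint32_t head = 0; uint32_t size = 0; };
 *
 *     void kv_ring_advance(kv_ring & kv, uint32_t n_tokens) {
 *         kv.head += n_tokens;       // move past the cells written by this ubatch
 *         if (kv.head >= kv.size) {
 *             kv.head = 0;           // wrap, so the next find_slot() search restarts at cell 0
 *         }
 *     }
 */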
- if (kv_self.head >= kv_self.size) { - kv_self.head = 0; + if (kv_self->head >= kv_self->size) { + kv_self->head = 0; } } @@ -4044,9 +4046,9 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context_base::input_set(ubatch); - GGML_ASSERT(kv_self.recurrent); + GGML_ASSERT(kv_self->recurrent); - const int64_t n_kv = kv_self.n; + const int64_t n_kv = kv_self->n; if (inp.s_mask) { GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_mask->buffer)); @@ -4054,8 +4056,8 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // clear unused states for (int i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = kv_self.cells[cell_id]; + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); @@ -4073,11 +4075,11 @@ void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = kv_self.cells[cell_id]; + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; // prevent out-of-bound sources - if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { + if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { kv_cell.src = cell_id; } @@ -4101,7 +4103,7 @@ ggml_cgraph * llama_context_recurrent::graph_init() { ggml_tensor * llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0) { - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; inp.s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); //cb(inp.s_copy, "inp_s_copy", -1); @@ -4112,7 +4114,7 @@ ggml_tensor * llama_context_recurrent::build_inp_s_copy( ggml_tensor * llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0) { - const auto n_kv = kv_self.n; + const auto n_kv = kv_self->n; inp.s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); //cb(inp.s_mask, "inp_s_mask", -1); @@ -4129,10 +4131,10 @@ ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_tensor * state_mask, int32_t n_state, int32_t n_seqs) { - const auto n_kv = kv_self.n; - const auto kv_head = kv_self.head; + const auto n_kv = kv_self->n; + const auto kv_head = kv_self->head; - struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); + struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self->size); // copy states // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv @@ -4164,7 +4166,7 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( int il) { const auto & hparams = model.hparams; - const auto kv_head = kv_self.head; + const auto kv_head = kv_self->head; const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = hparams.ssm_d_inner; @@ -4182,8 +4184,8 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - struct ggml_tensor * conv_states_all = kv_self.k_l[il]; - struct ggml_tensor * ssm_states_all = kv_self.v_l[il]; + struct ggml_tensor * conv_states_all = kv_self->k_l[il]; + struct ggml_tensor * ssm_states_all = kv_self->v_l[il]; // (ab)using the KV cache to store the states struct ggml_tensor * conv = build_copy_mask_state( @@ -4300,7 +4302,7 @@ ggml_tensor * 
llama_context_recurrent::build_rwkv_token_shift_load( const int64_t n_seqs = ubatch.n_seqs; - struct ggml_tensor * token_shift_all = kv_self.k_l[il]; + struct ggml_tensor * token_shift_all = kv_self->k_l[il]; struct ggml_tensor * token_shift = build_copy_mask_state( ctx0, gf, token_shift_all, state_copy, state_mask, @@ -4323,12 +4325,12 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store( const int64_t n_seqs = ubatch.n_seqs; - const auto kv_head = kv_self.head; + const auto kv_head = kv_self->head; return ggml_cpy( ctx0, ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + ggml_view_1d(ctx0, kv_self->k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self->k_l[il])) ); } @@ -4350,7 +4352,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( const auto n_head = n_embd / head_size; const auto n_head_kv = hparams.n_head_kv(il); - const auto kv_head = kv_self.head; + const auto kv_head = kv_self->head; const auto & layer = model.layers[il]; @@ -4458,7 +4460,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( } struct ggml_tensor * wkv_state = build_copy_mask_state( - ctx0, gf, kv_self.v_l[il], state_copy, state_mask, + ctx0, gf, kv_self->v_l[il], state_copy, state_mask, hparams.n_embd_v_s(), n_seqs); struct ggml_tensor * wkv_output; @@ -4477,9 +4479,9 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( wkv_state, ggml_view_1d( ctx0, - kv_self.v_l[il], + kv_self->v_l[il], hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il]) ) ) ); @@ -4507,7 +4509,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( size_t llama_context_recurrent::state_write_data(llama_io_write_i & io) { llama_context_base::state_write_data(io); - kv_self.state_write(io); + kv_self->state_write(io); return io.n_bytes(); } @@ -4515,7 +4517,7 @@ size_t llama_context_recurrent::state_write_data(llama_io_write_i & io) { size_t llama_context_recurrent::state_read_data(llama_io_read_i & io) { llama_context_base::state_read_data(io); - kv_self.state_read(io); + kv_self->state_read(io); return io.n_bytes(); } @@ -4523,7 +4525,7 @@ size_t llama_context_recurrent::state_read_data(llama_io_read_i & io) { size_t llama_context_recurrent::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context_base::state_seq_write_data(io, seq_id); - kv_self.state_write(io, seq_id); + kv_self->state_write(io, seq_id); return io.n_bytes(); } @@ -4531,7 +4533,7 @@ size_t llama_context_recurrent::state_seq_write_data(llama_io_write_i & io, llam size_t llama_context_recurrent::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context_base::state_seq_read_data(io, seq_id); - kv_self.state_read(io, seq_id); + kv_self->state_read(io, seq_id); return io.n_bytes(); } @@ -5211,7 +5213,7 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * return; } - llama_kv_cache_view_update(view, *kv); + llama_kv_cache_view_update(view, kv); } // diff --git a/src/llama-context.h b/src/llama-context.h index 1b807ccf84a5c..d74db70c7781c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -630,7 +630,7 @@ class llama_context_kv_self : public llama_context_base { // members // - llama_kv_cache kv_self; + 
std::unique_ptr kv_self; }; // a recurrent transformer (ie.e RWKV, Mamba) @@ -745,7 +745,7 @@ class llama_context_recurrent : public llama_context_base { // // TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models? - llama_kv_cache_recurrent kv_self; + std::unique_ptr kv_self; }; // TODO: tmp - need something better to pass the data from the encoder to the decoder diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index e1b07c9932166..0cd4142d5f8d5 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -6,17 +6,16 @@ #include "llama-model.h" #include -#include #include #include #include static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; -llama_kv_cache::llama_kv_cache(const llama_hparams & hparams) : hparams(hparams) { +llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams) : hparams(hparams) { } -bool llama_kv_cache::init( +bool llama_kv_cache_unified::init( const llama_model & model, const llama_cparams & cparams, ggml_type type_k, @@ -123,7 +122,7 @@ bool llama_kv_cache::init( return true; } -int32_t llama_kv_cache::n_tokens() const { +int32_t llama_kv_cache_unified::n_tokens() const { int32_t result = 0; for (uint32_t i = 0; i < size; i++) { @@ -133,7 +132,11 @@ int32_t llama_kv_cache::n_tokens() const { return result; } -size_t llama_kv_cache::total_size() const { +uint32_t llama_kv_cache_unified::used_cells() const { + return used; +} + +size_t llama_kv_cache_unified::total_size() const { size_t size = 0; for (const auto & buf : bufs) { size += ggml_backend_buffer_get_size(buf.get()); @@ -142,7 +145,7 @@ size_t llama_kv_cache::total_size() const { return size; } -llama_pos llama_kv_cache::pos_max() const { +llama_pos llama_kv_cache_unified::pos_max() const { llama_pos pos_max = -1; for (const auto & cell : cells) { pos_max = std::max(pos_max, cell.pos); @@ -151,7 +154,7 @@ llama_pos llama_kv_cache::pos_max() const { return pos_max; } -void llama_kv_cache::clear() { +void llama_kv_cache_unified::clear() { for (int32_t i = 0; i < (int32_t) size; ++i) { cells[i].pos = -1; cells[i].seq_id.clear(); @@ -166,7 +169,7 @@ void llama_kv_cache::clear() { } } -bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { +bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { uint32_t new_head = size; if (p0 < 0) { @@ -237,7 +240,7 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { return true; } -void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { +void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { if (seq_id_src == seq_id_dst) { return; } @@ -288,7 +291,7 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll } } -void llama_kv_cache::seq_keep(llama_seq_id seq_id) { +void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { uint32_t new_head = size; for (uint32_t i = 0; i < size; ++i) { @@ -320,7 +323,7 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) { } } -void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { +void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { if (delta == 0) { return; } @@ -378,7 +381,7 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll head = new_head != size ? 
new_head : 0; } -void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { +void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { if (d == 1) { return; } @@ -424,7 +427,7 @@ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, in } } -llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) { +llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) { llama_pos result = 0; for (uint32_t i = 0; i < size; ++i) { @@ -436,13 +439,17 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) { return result; } -void llama_kv_cache::defrag() { +void llama_kv_cache_unified::defrag() { if (!recurrent) { do_defrag = true; } } -struct llama_kv_cache_slot_info llama_kv_cache::find_slot( +bool llama_kv_cache_unified::get_can_shift() const { + return can_shift; +} + +struct llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( const struct llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = ubatch.n_seqs; @@ -663,12 +670,12 @@ struct llama_kv_cache_slot_info llama_kv_cache::find_slot( return llama_kv_cache_slot_info(head, head + n_tokens); } -uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) const { +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { // the FA kernels require padding to avoid extra runtime boundary checks return cparams.flash_attn ? 256u : 32u; } -uint32_t llama_kv_cache::cell_max() const { +uint32_t llama_kv_cache_unified::cell_max() const { for (uint32_t i = size; i > 0; --i) { const llama_kv_cell & cell = cells[i - 1]; @@ -680,7 +687,7 @@ uint32_t llama_kv_cache::cell_max() const { return 0; } -size_t llama_kv_cache::size_k_bytes() const { +size_t llama_kv_cache_unified::size_k_bytes() const { size_t size_k_bytes = 0; for (const auto & k : k_l) { @@ -690,7 +697,7 @@ size_t llama_kv_cache::size_k_bytes() const { return size_k_bytes; } -size_t llama_kv_cache::size_v_bytes() const { +size_t llama_kv_cache_unified::size_v_bytes() const { size_t size_v_bytes = 0; for (const auto & v : v_l) { @@ -700,7 +707,7 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { +void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -738,7 +745,7 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) con state_write_data(io, cell_ranges); } -void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id) { +void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) { uint32_t cell_count; io.read_to(&cell_count, sizeof(cell_count)); @@ -756,7 +763,7 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id) { } } -void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { +void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { for (const auto & range : cell_ranges) { for (uint32_t i = range.first; i < range.second; ++i) { const auto & cell = cells[i]; @@ -775,7 +782,7 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges) const { +void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const 
std::vector> & cell_ranges) const { const uint32_t v_trans = this->v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; @@ -855,7 +862,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector= llama_n_seq_max(ctx)) { if (seq_id < 0) { //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); @@ -957,7 +964,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t cell_count, return true; } -bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t cell_count) { +bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; io.read_to(&v_trans, sizeof(v_trans)); @@ -1092,7 +1099,7 @@ int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { return 0; } - return kv->used; + return kv->used_cells(); } void llama_kv_cache_clear(llama_kv_cache * kv) { @@ -1183,7 +1190,7 @@ bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { return false; } - return kv->can_shift; + return kv->get_can_shift(); } // @@ -1216,9 +1223,16 @@ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { } } -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv) { - if (uint32_t(view->n_cells) < kv.size || view->cells == nullptr) { - view->n_cells = int32_t(kv.size); +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache * kv) { + // TODO: rework this in the future, for now quick hack + const llama_kv_cache_unified * kvu = dynamic_cast(kv); + if (kvu == nullptr) { + LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__); + return; + } + + if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) { + view->n_cells = int32_t(kvu->size); void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); view->cells = (struct llama_kv_cache_view_cell *)p; @@ -1227,7 +1241,7 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct view->cells_sequences = (llama_seq_id *)p; } - const std::vector & kv_cells = kv.cells; + const std::vector & kv_cells = kvu->cells; llama_kv_cache_view_cell * c_curr = view->cells; llama_seq_id * cs_curr = view->cells_sequences; int32_t used_cells = 0; @@ -1236,7 +1250,7 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct uint32_t max_contig = 0; int32_t max_contig_idx = -1; - for (int32_t i = 0; i < int32_t(kv.size); i++, c_curr++, cs_curr += view->n_seq_max) { + for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) { const size_t curr_size = kv_cells[i].seq_id.size(); token_count += curr_size; c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; @@ -1274,8 +1288,8 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct view->max_contiguous_idx = max_contig_idx; view->token_count = token_count; view->used_cells = used_cells; - if (uint32_t(used_cells) != kv.used) { + if (uint32_t(used_cells) != kvu->used) { LLAMA_LOG_ERROR("%s: used cells mismatch. 
kv_cache says %d but we calculated %d\n", - __func__, kv.used, used_cells); + __func__, kvu->used, used_cells); } } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dda9bfec48846..99eb0be3c7404 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -45,12 +45,39 @@ struct llama_kv_cache_slot_info { operator bool() const { return found; } }; +struct llama_kv_cache { +public: + virtual int32_t n_tokens() const = 0; + virtual uint32_t used_cells() const = 0; // TODO: remove + + virtual void clear() = 0; + virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; + virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; + virtual void seq_keep(llama_seq_id seq_id) = 0; + virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0; + virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; + + virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0; + + virtual void defrag() = 0; + virtual bool get_can_shift() const = 0; +}; + + +// C++ alias +class llama_kv_cache_i : public llama_kv_cache { +public: + using llama_kv_cache::llama_kv_cache; +}; + + // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences -struct llama_kv_cache { - llama_kv_cache(const llama_hparams & hparams); - virtual ~llama_kv_cache() = default; +class llama_kv_cache_unified : public llama_kv_cache_i { +public: + llama_kv_cache_unified(const llama_hparams & hparams); + virtual ~llama_kv_cache_unified() = default; // TODO: become constructor bool init( @@ -61,24 +88,26 @@ struct llama_kv_cache { uint32_t kv_size, bool offload); - int32_t n_tokens() const; + int32_t n_tokens() const override; + uint32_t used_cells() const override; size_t total_size() const; // TODO: better data structures to reduce the cost of this operation llama_pos pos_max() const; - void clear(); + void clear() override; - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); - void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); - void seq_keep(llama_seq_id seq_id); - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; - llama_pos seq_pos_max(llama_seq_id seq_id); + llama_pos seq_pos_max(llama_seq_id seq_id) override; - void defrag(); + void defrag() override; + bool get_can_shift() const override; // find an empty slot of size "n_tokens" in the cache // updates the cache head @@ -143,9 +172,10 @@ struct llama_kv_cache { bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; -// TODO: temporary reusing llama_kv_cache -- implement recurrent cache and simplify llama_kv_cache -struct llama_kv_cache_recurrent : public llama_kv_cache { - using llama_kv_cache::llama_kv_cache; +// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified +class llama_kv_cache_recurrent : public llama_kv_cache_unified { +public: + using llama_kv_cache_unified::llama_kv_cache_unified; }; // @@ -166,9 +196,9 
@@ struct llama_kv_slot_restorer { bool do_restore = false; - llama_kv_cache & cache; + llama_kv_cache_unified & cache; - explicit llama_kv_slot_restorer(llama_kv_cache & cache) : cache(cache) { + explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) { old_state.head = cache.head; old_state.n = cache.n; } @@ -249,4 +279,4 @@ bool llama_kv_cache_can_shift(const llama_kv_cache * kv); struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache * kv); From 38db8a586105ea8d516e66d0dbcb87924efe70b0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 10:51:17 +0200 Subject: [PATCH 80/84] llama : introduce concept of llama_memory ggml-ci --- src/llama-context.cpp | 2 +- src/llama-context.h | 17 +- src/llama-kv-cache.cpp | 8 +- src/llama-kv-cache.h | 47 +- src/llama-memory.cpp | 1295 ++++++++++++++++++++++++++++++++++++++++ src/llama-memory.h | 21 + 6 files changed, 1345 insertions(+), 45 deletions(-) create mode 100644 src/llama-memory.cpp create mode 100644 src/llama-memory.h diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5c77b29c13a7d..c599801763181 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -49,7 +49,7 @@ llama_context_base::llama_context_base( const llama_model & model, llama_context_params params, llama_graph_type gtype) : - llama_context_i(), + llama_context(), llama_graph_i(gtype), model(model) { LLAMA_LOG_INFO("%s: constructing llama_context_base, gtype = %d\n", __func__, gtype); diff --git a/src/llama-context.h b/src/llama-context.h index d74db70c7781c..f44652e2d1f18 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -21,10 +21,10 @@ class llama_io_write_i; using llama_loras = std::unordered_map; // abstract interface corresponding to the public C API -struct llama_context { +class llama_context_i { public: - llama_context() = default; - virtual ~llama_context() = default; + llama_context_i() = default; + virtual ~llama_context_i() = default; virtual void init() = 0; @@ -157,14 +157,13 @@ struct llama_context { size_t n_token_count) = 0; }; -// C++ alias -class llama_context_i : public llama_context { -public: - using llama_context::llama_context; +// C alias +struct llama_context : public llama_context_i { + using llama_context_i::llama_context_i; }; // basic transformer without KV cache -class llama_context_base : public llama_context_i, public llama_graph_i { +class llama_context_base : public llama_context, public llama_graph_i { public: llama_context_base( const llama_model & model, @@ -821,7 +820,7 @@ class llama_context_dec : public llama_context_kv_self { llama_cross * cross = nullptr; }; -class llama_context_enc_dec : public llama_context_i { +class llama_context_enc_dec : public llama_context { public: llama_context_enc_dec( const llama_model & model, diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 0cd4142d5f8d5..33ee833125b58 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -122,7 +122,7 @@ bool llama_kv_cache_unified::init( return true; } -int32_t llama_kv_cache_unified::n_tokens() const { +int32_t llama_kv_cache_unified::get_n_tokens() const { int32_t result = 0; for (uint32_t i = 0; i < size; i++) { @@ -132,7 +132,7 @@ int32_t llama_kv_cache_unified::n_tokens() const { return result; } -uint32_t 
llama_kv_cache_unified::used_cells() const { +uint32_t llama_kv_cache_unified::get_used_cells() const { return used; } @@ -1091,7 +1091,7 @@ int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { return 0; } - return kv->n_tokens(); + return kv->get_n_tokens(); } int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { @@ -1099,7 +1099,7 @@ int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { return 0; } - return kv->used_cells(); + return kv->get_used_cells(); } void llama_kv_cache_clear(llama_kv_cache * kv) { diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 99eb0be3c7404..8aed239154885 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -2,7 +2,7 @@ #include "llama.h" #include "llama-io.h" -#include "llama-graph.h" +#include "llama-memory.h" #include "ggml-cpp.h" @@ -13,6 +13,17 @@ struct llama_cparams; struct llama_hparams; struct llama_ubatch; +struct llama_kv_cache : public llama_memory_i { + using llama_memory_i::llama_memory_i; + + virtual int32_t get_n_tokens() const = 0; + virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache + + virtual bool get_can_shift() const = 0; + + bool get_can_edit() const override { return get_can_shift(); } +}; + struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; @@ -45,36 +56,10 @@ struct llama_kv_cache_slot_info { operator bool() const { return found; } }; -struct llama_kv_cache { -public: - virtual int32_t n_tokens() const = 0; - virtual uint32_t used_cells() const = 0; // TODO: remove - - virtual void clear() = 0; - virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; - virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; - virtual void seq_keep(llama_seq_id seq_id) = 0; - virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0; - virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; - - virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0; - - virtual void defrag() = 0; - virtual bool get_can_shift() const = 0; -}; - - -// C++ alias -class llama_kv_cache_i : public llama_kv_cache { -public: - using llama_kv_cache::llama_kv_cache; -}; - - // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences -class llama_kv_cache_unified : public llama_kv_cache_i { +class llama_kv_cache_unified : public llama_kv_cache { public: llama_kv_cache_unified(const llama_hparams & hparams); virtual ~llama_kv_cache_unified() = default; @@ -88,8 +73,8 @@ class llama_kv_cache_unified : public llama_kv_cache_i { uint32_t kv_size, bool offload); - int32_t n_tokens() const override; - uint32_t used_cells() const override; + int32_t get_n_tokens() const override; + uint32_t get_used_cells() const override; size_t total_size() const; @@ -97,6 +82,7 @@ class llama_kv_cache_unified : public llama_kv_cache_i { llama_pos pos_max() const; void clear() override; + void defrag() override; bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; @@ -106,7 +92,6 @@ class llama_kv_cache_unified : public llama_kv_cache_i { llama_pos seq_pos_max(llama_seq_id seq_id) override; - void defrag() override; bool get_can_shift() const override; // find an empty slot of size "n_tokens" in the cache diff --git a/src/llama-memory.cpp b/src/llama-memory.cpp new file mode 100644 index 0000000000000..0cd4142d5f8d5 
--- /dev/null +++ b/src/llama-memory.cpp @@ -0,0 +1,1295 @@ +#include "llama-kv-cache.h" + +#include "llama-impl.h" +#include "llama-batch.h" +#include "llama-cparams.h" +#include "llama-model.h" + +#include +#include +#include +#include + +static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; + +llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams) : hparams(hparams) { +} + +bool llama_kv_cache_unified::init( + const llama_model & model, + const llama_cparams & cparams, + ggml_type type_k, + ggml_type type_v, + uint32_t kv_size, + bool offload) { + const int32_t n_layer = hparams.n_layer; + + has_shift = false; + + recurrent = llama_model_is_recurrent(&model); + v_trans = !recurrent && !cparams.flash_attn; + can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + + LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", + __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); + + head = 0; + size = kv_size; + used = 0; + + this->type_k = type_k; + this->type_v = type_v; + + cells.clear(); + cells.resize(kv_size); + + // create a context for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + struct ggml_init_params params = { + /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return nullptr; + } + + ctx_map[buft] = ctx; + ctxs.emplace_back(ctx); + + return ctx; + } + + return it->second; + }; + + k_l.reserve(n_layer); + v_l.reserve(n_layer); + + for (int i = 0; i < n_layer; i++) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); + + const char * dev_name = "CPU"; + + ggml_backend_buffer_type_t buft; + if (offload) { + auto * dev = model.dev_layer(i); + buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); + } else { + buft = ggml_backend_cpu_buffer_type(); + } + + LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, + i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); + return false; + } + + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + k_l.push_back(k); + v_l.push_back(v); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + auto * buft = it.first; + auto * ctx = it.second; + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); + return false; + } + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + bufs.emplace_back(buf); + } + + return true; +} + +int32_t llama_kv_cache_unified::n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + 
 result += cells[i].seq_id.size();
+ }
+
+ return result;
+}
+
+uint32_t llama_kv_cache_unified::used_cells() const {
+ return used;
+}
+
+size_t llama_kv_cache_unified::total_size() const {
+ size_t size = 0;
+ for (const auto & buf : bufs) {
+ size += ggml_backend_buffer_get_size(buf.get());
+ }
+
+ return size;
+}
+
+llama_pos llama_kv_cache_unified::pos_max() const {
+ llama_pos pos_max = -1;
+ for (const auto & cell : cells) {
+ pos_max = std::max(pos_max, cell.pos);
+ }
+
+ return pos_max;
+}
+
+void llama_kv_cache_unified::clear() {
+ for (int32_t i = 0; i < (int32_t) size; ++i) {
+ cells[i].pos = -1;
+ cells[i].seq_id.clear();
+ cells[i].src = -1;
+ cells[i].tail = -1;
+ }
+ head = 0;
+ used = 0;
+
+ for (auto & buf : bufs) {
+ ggml_backend_buffer_clear(buf.get(), 0);
+ }
+}
+
+bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+ uint32_t new_head = size;
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // models like Mamba or RWKV can't have a state partially erased
+ if (recurrent) {
+ if (seq_id >= (int64_t) size) {
+ // could be fatal
+ return false;
+ }
+ if (0 <= seq_id) {
+ int32_t & tail_id = cells[seq_id].tail;
+ if (tail_id >= 0) {
+ const llama_kv_cell & cell = cells[tail_id];
+ // partial intersection is invalid
+ if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+ return false;
+ }
+ // invalidate tails which will be cleared
+ if (p0 <= cell.pos && cell.pos < p1) {
+ tail_id = -1;
+ }
+ }
+ } else {
+ // seq_id is negative, then the range should include everything or nothing
+ if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+ return false;
+ }
+ }
+ }
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].pos >= p0 && cells[i].pos < p1) {
+ if (seq_id < 0) {
+ cells[i].seq_id.clear();
+ } else if (cells[i].has_seq_id(seq_id)) {
+ cells[i].seq_id.erase(seq_id);
+ } else {
+ continue;
+ }
+ if (cells[i].is_empty()) {
+ // keep count of the number of used cells
+ if (cells[i].pos >= 0) {
+ used--;
+ }
+
+ cells[i].pos = -1;
+ cells[i].src = -1;
+
+ if (new_head == size) {
+ new_head = i;
+ }
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != size && new_head < head) {
+ head = new_head;
+ }
+
+ return true;
+}
+
+void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+ if (seq_id_src == seq_id_dst) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ if (recurrent) {
+ if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
+ llama_kv_cell & tail_src = cells[seq_id_src];
+ llama_kv_cell & tail_dst = cells[seq_id_dst];
+ if (tail_dst.tail >= 0) {
+ // clear destination seq_id if it wasn't empty
+ llama_kv_cell & cell_dst = cells[tail_dst.tail];
+
+ cell_dst.seq_id.erase(seq_id_dst);
+ tail_dst.tail = -1;
+ if (cell_dst.seq_id.empty()) {
+ cell_dst.pos = -1;
+ cell_dst.delta = -1;
+ cell_dst.src = -1;
+ used -= 1;
+ }
+ }
+ if (tail_src.tail >= 0) {
+ llama_kv_cell & cell_src = cells[tail_src.tail];
+
+ cell_src.seq_id.insert(seq_id_dst);
+ tail_dst.tail = tail_src.tail;
+ }
+ }
+
+ return;
+ }
+
+ // otherwise, this is the KV of a Transformer-like model
+ head = 0;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
+ cells[i].seq_id.insert(seq_id_dst);
+ }
+ }
+}
+
+void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+ uint32_t new_head = size;
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (recurrent && (llama_seq_id) i != seq_id) {
+ cells[i].tail = -1;
+ }
+
+ if (!cells[i].has_seq_id(seq_id)) {
+ if (cells[i].pos >= 0) {
+ used--;
+ }
+
+ cells[i].pos = -1;
+ cells[i].src = -1;
+ cells[i].seq_id.clear();
+
+ if (new_head == size){
+ new_head = i;
+ }
+ } else {
+ cells[i].seq_id.clear();
+ cells[i].seq_id.insert(seq_id);
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ if (new_head != size && new_head < head) {
+ head = new_head;
+ }
+}
+
+void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+ if (delta == 0) {
+ return;
+ }
+
+ uint32_t new_head = size;
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) {
+ return;
+ }
+
+ if (recurrent) {
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
+ if (0 <= seq_id && seq_id < (int64_t) size) {
+ const int32_t tail_id = cells[seq_id].tail;
+ if (tail_id >= 0) {
+ llama_kv_cell & cell = cells[tail_id];
+ if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+ cell.pos += delta;
+ }
+ }
+ }
+ return;
+ }
+
+ for (uint32_t i = 0; i < size; ++i) {
+ if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
+ has_shift = true;
+ cells[i].pos += delta;
+ cells[i].delta += delta;
+
+ if (cells[i].pos < 0) {
+ if (!cells[i].is_empty()) {
+ used--;
+ }
+ cells[i].pos = -1;
+ cells[i].seq_id.clear();
+ if (new_head == size) {
+ new_head = i;
+ }
+ }
+ }
+ }
+
+ // If we freed up a slot, set head to it so searching can start there.
+ // Otherwise we just start the next search from the beginning.
+ head = new_head != size ? new_head : 0;
+}
+
+void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+ if (d == 1) {
+ return;
+ }
+
+ if (p0 < 0) {
+ p0 = 0;
+ }
+
+ if (p1 < 0) {
+ p1 = std::numeric_limits<llama_pos>::max();
+ }
+
+ // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } +} + +llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +void llama_kv_cache_unified::defrag() { + if (!recurrent) { + do_defrag = true; + } +} + +bool llama_kv_cache_unified::get_can_shift() const { + return can_shift; +} + +struct llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( + const struct llama_ubatch & ubatch) { + const uint32_t n_tokens = ubatch.n_tokens; + const uint32_t n_seqs = ubatch.n_seqs; + const uint32_t n_seq_tokens = ubatch.n_seq_tokens; + + if (recurrent) { + // For recurrent state architectures (like Mamba or RWKV), + // each cache cell can store the state for a whole sequence. + // A slot should be always be contiguous. + + // can only process batches with an equal number of new tokens in each sequence + GGML_ASSERT(ubatch.equal_seqs); + + int32_t min = size - 1; + int32_t max = 0; + + // everything should fit if all seq_ids are smaller than the max + for (uint32_t s = 0; s < n_seqs; ++s) { + const uint32_t n_seq_id = ubatch.n_seq_id[s]; + for (uint32_t j = 0; j < n_seq_id; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + + if (seq_id < 0 || (uint32_t) seq_id >= size) { + // too big seq_id + // TODO: would it be possible to resize the cache instead? 
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); + return llama_kv_cache_slot_info_failed; + } + if (j > 0) { + llama_kv_cell & seq = cells[seq_id]; + if (seq.tail >= 0) { + llama_kv_cell & cell = cells[seq.tail]; + // clear cells from seq_ids that become shared + // (should not normally happen, but let's handle it anyway) + cell.seq_id.erase(seq_id); + seq.tail = -1; + if (cell.seq_id.empty()) { + cell.pos = -1; + cell.src = -1; + used -= 1; + } + } + } + } + } + +#ifndef NDEBUG + { + std::vector tails_verif; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + llama_kv_cell & cell = cells[i]; + for (llama_seq_id seq_id : cell.seq_id) { + if (tails_verif[seq_id] != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); + } + tails_verif[seq_id] = i; + } + } + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); + } + } + } +#endif + + // find next empty cell + uint32_t next_empty_cell = head; + + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + + // find usable cell range + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + llama_kv_cell & seq_meta = cells[seq_id]; + bool has_cell = false; + if (seq_meta.tail >= 0) { + llama_kv_cell & cell = cells[seq_meta.tail]; + GGML_ASSERT(cell.has_seq_id(seq_id)); + // does this seq_id "own" the cell? + if (cell.seq_id.size() == 1) { has_cell = true; } + } + if (!has_cell) { + llama_kv_cell & empty_cell = cells[next_empty_cell]; + GGML_ASSERT(empty_cell.is_empty()); + // copy old tail into the empty cell + if (seq_meta.tail >= 0) { + llama_kv_cell & orig_cell = cells[seq_meta.tail]; + empty_cell.pos = orig_cell.pos; + empty_cell.src = orig_cell.src; + orig_cell.seq_id.erase(seq_id); + empty_cell.seq_id.insert(seq_id); // will be overwritten + } + seq_meta.tail = next_empty_cell; + // find next empty cell + if (s + 1 < n_seqs) { + next_empty_cell += 1; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + } + } + if (min > seq_meta.tail) { min = seq_meta.tail; } + if (max < seq_meta.tail) { max = seq_meta.tail; } + } + + // gather and re-order + for (uint32_t s = 0; s < n_seqs; ++s) { + int32_t dst_id = s + min; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; + if (dst_id != src_id) { + llama_kv_cell & dst_cell = cells[dst_id]; + llama_kv_cell & src_cell = cells[src_id]; + + std::swap(dst_cell.pos, src_cell.pos); + std::swap(dst_cell.src, src_cell.src); + std::swap(dst_cell.seq_id, src_cell.seq_id); + + // swap tails (assuming they NEVER overlap) + for (const llama_seq_id seq_id : src_cell.seq_id) { + cells[seq_id].tail = src_id; + } + for (const llama_seq_id seq_id : dst_cell.seq_id) { + cells[seq_id].tail = dst_id; + } + } + } + + // update the pos of the used seqs + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + int32_t cell_id = s + min; + llama_kv_cell & cell = cells[cell_id]; + + if (cell.pos >= 0 && last_pos != cell.pos + 
(llama_pos) n_seq_tokens) { + // What should happen when the pos backtracks or skips a value? + // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", + __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); + } + cell.pos = last_pos; + cell.seq_id.clear(); + for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + cell.seq_id.insert(seq_id); + cells[seq_id].tail = cell_id; + } + } + + // allow getting the range of used cells, from head to head + n + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), + [](const llama_kv_cell& cell){ return !cell.is_empty(); }); + + // sanity check + return llama_kv_cache_slot_info(n >= n_seqs); + } + + // otherwise, one cell per token. + + if (n_tokens > size) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); + return llama_kv_cache_slot_info_failed; + } + + uint32_t n_tested = 0; + + while (true) { + if (head + n_tokens > size) { + n_tested += size - head; + head = 0; + continue; + } + + bool found = true; + for (uint32_t i = 0; i < n_tokens; i++) { + if (cells[head + i].pos >= 0) { + found = false; + head += i + 1; + n_tested += i + 1; + break; + } + } + + if (found) { + break; + } + + if (n_tested >= size) { + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return llama_kv_cache_slot_info_failed; + } + } + + for (uint32_t s = 0; s < n_seqs; s++) { + for (uint32_t i = 0; i < n_seq_tokens; ++i) { + uint32_t k = s*n_seq_tokens + i; + cells[head + k].pos = ubatch.pos[k]; + + for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { + cells[head + k].seq_id.insert(ubatch.seq_id[s][j]); + } + } + } + + used += n_tokens; + + return llama_kv_cache_slot_info(head, head + n_tokens); +} + +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; +} + +uint32_t llama_kv_cache_unified::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const llama_kv_cell & cell = cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; + } + } + + return 0; +} + +size_t llama_kv_cache_unified::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache_unified::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + +void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges); +} + +void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, cell_count); + + if (!res) { + if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { + const uint32_t v_trans = this->v_trans ? 
1 : 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + io.write_tensor(v_l[il], src_offset, buf_size); + } + } + } + } +} + +bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find 
available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache_unified should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + if (recurrent) { + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + } + + head = 0; + used = cell_count; + } + + if (recurrent) { + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + } + + return true; +} + +bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the 
keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } + } + + return true; +} + +// +// interface implementation +// + +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + if (!kv) { + return 0; + } + + return kv->n_tokens(); +} + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + if (!kv) { + return 0; + } + + return kv->used_cells(); +} + +void llama_kv_cache_clear(llama_kv_cache * kv) { + if (!kv) { + return; + } + + kv->clear(); +} + +bool llama_kv_cache_seq_rm( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + if (!kv) { + return true; + } + + return kv->seq_rm(seq_id, p0, p1); +} + +void llama_kv_cache_seq_cp( + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + if (!kv) { + return; + } + + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); +} + +void 
llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id) { + if (!kv) { + return; + } + + kv->seq_keep(seq_id); +} + +void llama_kv_cache_seq_add( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + if (!kv) { + return; + } + + kv->seq_add(seq_id, p0, p1, delta); +} + +void llama_kv_cache_seq_div( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + if (!kv) { + return; + } + + kv->seq_div(seq_id, p0, p1, d); +} + +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id) { + if (!kv) { + return 0; + } + + return kv->seq_pos_max(seq_id); +} + +void llama_kv_cache_defrag(llama_kv_cache * kv) { + if (!kv) { + return; + } + + kv->defrag(); +} + +bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { + if (!kv) { + return false; + } + + return kv->get_can_shift(); +} + +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max) { + struct llama_kv_cache_view result = { + /*.n_cells = */ 0, + /*.n_seq_max = */ n_seq_max, + /*.token_count = */ 0, + /*.used_cells = */ llama_kv_cache_used_cells(&kv), + /*.max_contiguous = */ 0, + /*.max_contiguous_idx = */ -1, + /*.cells = */ nullptr, + /*.cells_sequences = */ nullptr, + }; + + return result; +} + +void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { + if (view->cells != nullptr) { + free(view->cells); + view->cells = nullptr; + } + if (view->cells_sequences != nullptr) { + free(view->cells_sequences); + view->cells_sequences = nullptr; + } +} + +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache * kv) { + // TODO: rework this in the future, for now quick hack + const llama_kv_cache_unified * kvu = dynamic_cast(kv); + if (kvu == nullptr) { + LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__); + return; + } + + if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) { + view->n_cells = int32_t(kvu->size); + void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); + GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); + view->cells = (struct llama_kv_cache_view_cell *)p; + p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells); + GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); + view->cells_sequences = (llama_seq_id *)p; + } + + const std::vector & kv_cells = kvu->cells; + llama_kv_cache_view_cell * c_curr = view->cells; + llama_seq_id * cs_curr = view->cells_sequences; + int32_t used_cells = 0; + int32_t token_count = 0; + int32_t curr_contig_idx = -1; + uint32_t max_contig = 0; + int32_t max_contig_idx = -1; + + for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) { + const size_t curr_size = kv_cells[i].seq_id.size(); + token_count += curr_size; + c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; + + if (curr_size > 0) { + if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) { + max_contig = i - curr_contig_idx; + max_contig_idx = curr_contig_idx; + } + curr_contig_idx = -1; + } else if (curr_contig_idx < 0) { + curr_contig_idx = i; + } + + int seq_idx = 0; + for (const llama_seq_id it : kv_cells[i].seq_id) { + if (seq_idx >= view->n_seq_max) { + break; + } + cs_curr[seq_idx] = it; + seq_idx++; + } + if (seq_idx != 0) { + used_cells++; + } + for (; seq_idx < 
view->n_seq_max; seq_idx++) { + cs_curr[seq_idx] = -1; + } + } + if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) { + max_contig_idx = curr_contig_idx; + max_contig = kv_cells.size() - curr_contig_idx; + } + view->max_contiguous = max_contig; + view->max_contiguous_idx = max_contig_idx; + view->token_count = token_count; + view->used_cells = used_cells; + if (uint32_t(used_cells) != kvu->used) { + LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n", + __func__, kvu->used, used_cells); + } +} diff --git a/src/llama-memory.h b/src/llama-memory.h new file mode 100644 index 0000000000000..69e6e34ca4516 --- /dev/null +++ b/src/llama-memory.h @@ -0,0 +1,21 @@ +#pragma once + +#include "llama.h" + +// general concept of LLM memory +// the KV cache is a type of LLM memory, but there can be other types +class llama_memory_i { +public: + virtual void clear() = 0; + virtual void defrag() = 0; + + virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; + virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; + virtual void seq_keep(llama_seq_id seq_id) = 0; + virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0; + virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; + + virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0; + + virtual bool get_can_edit() const = 0; +}; From 7f02ee562efae35fa0abcd8f4ae3bbfe3728be27 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 14:09:20 +0200 Subject: [PATCH 81/84] context : decouple inputs, llama_graph_i become const (WIP) ggml-ci --- src/llama-context.cpp | 737 ++++++++++++++++++++++++------------------ src/llama-context.h | 53 ++- src/llama-graph.cpp | 25 +- src/llama-graph.h | 86 ++++- src/llama-model.cpp | 463 +++++++++++++------------- src/llama-model.h | 5 +- 6 files changed, 789 insertions(+), 580 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c599801763181..5ac28f983027e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -45,6 +45,137 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t // llama_context_base // +class llama_graph_input_embd : public llama_graph_input_i { +public: + llama_graph_input_embd() = default; + virtual ~llama_graph_input_embd() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * tokens = nullptr; // I32 [n_batch] + ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] +}; + +void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { + if (ubatch->token) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); + } + + if (ubatch->embd) { + const int64_t n_embd = embd->ne[0]; + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd)); + } +} + +class llama_graph_input_attn_base : public llama_graph_input_attn_i { +public: + llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : + hparams(hparams), + cparams(cparams) { + } + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * get_kq_mask() override { return kq_mask_cnv; } + + ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch] + ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch] + + const llama_hparams & hparams; + const 
llama_cparams & cparams; +}; + +void llama_graph_input_attn_base::set_input(const llama_ubatch * ubatch) { + if (kq_mask) { + if (cparams.causal_attn) { + const int64_t n_kv = ubatch->n_tokens; + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer)); + float * data = (float *) kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch->seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { + if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) { + if (hparams.use_alibi) { + f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; + } + } + } + } + } + } else { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + const int64_t n_stride = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer)); + + float * data = (float *) kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch->seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { + if (ubatch->seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; + } + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } + } + } + } + } + } +} + llama_context_base::llama_context_base( const llama_model & model, llama_context_params params, @@ -714,7 +845,8 @@ int llama_context_base::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove, tmp here, until all inputs are migrated outside the context const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -729,7 +861,7 @@ int llama_context_base::encode(llama_batch & inp_batch) { return -3; } - auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); // extract embeddings if (t_embd) { @@ -870,7 +1002,8 @@ int llama_context_base::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -885,11 +1018,11 @@ int llama_context_base::decode(llama_batch & inp_batch) { } } - auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; - auto * t_embd = cparams.embeddings ? res.t_embd : nullptr; + auto * t_logits = cparams.embeddings ? 
nullptr : res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - if (t_embd && res.t_embd_pooled) { - t_embd = res.t_embd_pooled; + if (t_embd && res->get_embd_pooled()) { + t_embd = res->get_embd_pooled(); } // extract logits @@ -1002,19 +1135,6 @@ int64_t llama_context_base::n_pos_per_token() const { void llama_context_base::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (ubatch.token) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp.tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp.tokens)); - } - - if (ubatch.embd) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp.embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp.embd)); - } - if (ubatch.pos && inp.pos) { const int64_t n_tokens = ubatch.n_tokens; @@ -1159,91 +1279,6 @@ void llama_context_base::input_set(const llama_ubatch & ubatch) { } } - if (inp.kq_mask) { - if (cparams.causal_attn) { - const int64_t n_kv = ubatch.n_tokens; - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.kq_mask->buffer)); - float * data = (float *) inp.kq_mask->data; - - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; - - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; - - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id && ubatch.pos[ti] <= ubatch.pos[tj]) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } - - data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f; - } - } - } - } - } - } else { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_stride = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.kq_mask->buffer)); - - float * data = (float *) inp.kq_mask->data; - - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; - - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; - - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } - - data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; - } - } - - for (int i = n_tokens; i < n_stride; ++i) { - data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; - } - } - } - } - } - } - if (inp.pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; @@ -1401,7 +1436,7 @@ ggml_cgraph * llama_context_base::graph_init() { return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); } -llama_graph_result llama_context_base::graph_build( +llama_graph_result_ptr llama_context_base::graph_build( ggml_context * ctx, ggml_cgraph * gf, const llama_ubatch & ubatch) { @@ -1604,21 +1639,24 @@ ggml_tensor * 
llama_context_base::build_rope_shift( } ggml_tensor * llama_context_base::build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) { + llama_graph_result * res, + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) const { const auto & hparams = model.hparams; const int64_t n_embd = hparams.n_embd; + auto inp = std::make_shared(); + struct ggml_tensor * inpL; if (ubatch.token) { - inp.tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp.tokens, "inp_tokens", -1); - ggml_set_input(inp.tokens); + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); - inpL = ggml_get_rows(ctx0, tok_embd, inp.tokens); + inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens); // apply lora for embedding tokens if needed for (const auto & lora : loras) { @@ -1632,15 +1670,15 @@ ggml_tensor * llama_context_base::build_inp_embd( struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, inp.tokens) + ggml_get_rows(ctx0, lw->a, inp->tokens) ), scale); inpL = ggml_add(ctx0, inpL, inpL_delta); } } else { - inp.embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp.embd; - ggml_set_input(inp.embd); + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = inp->embd; + ggml_set_input(inp->embd); } // For Granite architecture @@ -1648,6 +1686,8 @@ ggml_tensor * llama_context_base::build_inp_embd( inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); } + res->add_input(std::move(inp)); + //cb(inpL, "inp_embd", -1); return inpL; @@ -1699,23 +1739,31 @@ ggml_tensor * llama_context_base::build_inp_cls( return inp.cls; } -void llama_context_base::build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa) { +llama_graph_input_attn_ptr llama_context_base::build_attn_inp( + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = std::make_shared(model.hparams, cparams); + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch GGML_UNUSED(causal); GGML_UNUSED(swa); - inp.kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp_kq_mask, "KQ_mask", -1); - ggml_set_input(inp.kq_mask); + ggml_set_input(inp->kq_mask); + + inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; - inp.kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp.kq_mask, GGML_TYPE_F16) : inp.kq_mask; + res->add_input(inp); + + return inp; } ggml_tensor * llama_context_base::build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -1723,10 +1771,10 @@ ggml_tensor * llama_context_base::build_attn( ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) { + int il) const { GGML_UNUSED(il); - const auto & kq_mask = inp.kq_mask_cnv; + const auto & kq_mask = inp->get_kq_mask(); ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); @@ -1751,7 +1799,7 @@ ggml_tensor * llama_context_base::build_attn_mha( ggml_tensor * kq_b, ggml_tensor * kq_mask, bool v_trans, - float kq_scale) { + float kq_scale) const { const auto & hparams = model.hparams; //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); @@ -2380,6 +2428,156 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i // llama_context_kv_self // +class llama_graph_input_attn_kv_self : public llama_graph_input_attn_i { +public: + llama_graph_input_attn_kv_self( + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache_unified * kv_self) : + hparams(hparams), + cparams(cparams), + kv_self(kv_self) { + } + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * get_kq_mask() override { return self_kq_mask_cnv; } + ggml_tensor * get_kq_mask_swa() override { return self_kq_mask_swa_cnv; } + + ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch] + ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch] + ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch] + + const llama_hparams & hparams; + const llama_cparams & cparams; + + const llama_kv_cache_unified * kv_self; +}; + +void llama_graph_input_attn_kv_self::set_input(const llama_ubatch * ubatch) { + if (self_kq_mask || self_kq_mask_swa) { + // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. + if (cparams.causal_attn) { + const int64_t n_kv = kv_self->n; + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + float * data = nullptr; + float * data_swa = nullptr; + + if (self_kq_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); + data = (float *) self_kq_mask->data; + } + + if (self_kq_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer)); + data_swa = (float *) self_kq_mask_swa->data; + } + + // For causal attention, use only the previous KV cells + // of the correct sequence for each token of the ubatch. + // It's assumed that if a token in the batch has multiple sequences, they are equivalent. 
+ for (int h = 0; h < 1; ++h) { + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + j]; + + for (int i = 0; i < n_kv; ++i) { + float f; + if (!kv_self->cells[i].has_seq_id(seq_id) || kv_self->cells[i].pos > pos) { + f = -INFINITY; + } else { + if (hparams.use_alibi) { + f = -std::abs(kv_self->cells[i].pos - pos); + } else { + f = 0.0f; + } + } + + if (data) { + data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + } + + // may need to cut off old tokens for sliding window + if (data_swa) { + if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) { + f = -INFINITY; + } + data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + } + } + } + } + + if (data) { + for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + for (int j = 0; j < n_kv; ++j) { + data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + } + } + } + + if (data_swa) { + for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + for (int j = 0; j < n_kv; ++j) { + data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + } + } + } + } + } else { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + // when using kv cache, the mask needs to match the kv cache size + const int64_t n_stride = n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); + + float * data = (float *) self_kq_mask->data; + + for (int h = 0; h < 1; ++h) { + for (int s1 = 0; s1 < n_seqs; ++s1) { + const llama_seq_id seq_id = ubatch->seq_id[s1][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const int32_t tj = s1*n_seq_tokens + j; + + for (int s0 = 0; s0 < n_seqs; ++s0) { + for (int i = 0; i < n_seq_tokens; ++i) { + const int32_t ti = s0*n_seq_tokens + i; + float f = -INFINITY; + + for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { + if (ubatch->seq_id[s0][s] == seq_id) { + if (hparams.use_alibi) { + f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]); + } else { + f = 0.0f; + } + break; + } + } + + data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; + } + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; + } + } + } + } + } + } +} + llama_context_kv_self::llama_context_kv_self( const llama_model & model, llama_context_params params, @@ -2593,7 +2791,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -2608,7 +2807,7 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { return -3; } - auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + auto * t_embd = res->get_embd_pooled() ? 
res->get_embd_pooled() : res->get_embd(); // extract embeddings if (t_embd) { @@ -2831,7 +3030,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -2861,11 +3061,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} - auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; - auto * t_embd = cparams.embeddings ? res.t_embd : nullptr; + auto * t_logits = cparams.embeddings ? nullptr : res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - if (t_embd && res.t_embd_pooled) { - t_embd = res.t_embd_pooled; + if (t_embd && res->get_embd_pooled()) { + t_embd = res->get_embd_pooled(); } // extract logits @@ -3009,127 +3209,6 @@ void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { // call base functionality llama_context_base::input_set(ubatch); - if (inp.self_kq_mask || inp.self_kq_mask_swa) { - // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. - if (cparams.causal_attn) { - const int64_t n_kv = kv_self->n; - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - float * data = nullptr; - float * data_swa = nullptr; - - if (inp.self_kq_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); - data = (float *) inp.self_kq_mask->data; - } - - if (inp.self_kq_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask_swa->buffer)); - data_swa = (float *) inp.self_kq_mask_swa->data; - } - - // For causal attention, use only the previous KV cells - // of the correct sequence for each token of the ubatch. - // It's assumed that if a token in the batch has multiple sequences, they are equivalent. 
- for (int h = 0; h < 1; ++h) { - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + j]; - - for (int i = 0; i < n_kv; ++i) { - float f; - if (!kv_self->cells[i].has_seq_id(seq_id) || kv_self->cells[i].pos > pos) { - f = -INFINITY; - } else { - if (hparams.use_alibi) { - f = -std::abs(kv_self->cells[i].pos - pos); - } else { - f = 0.0f; - } - } - - if (data) { - data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; - } - - // may need to cut off old tokens for sliding window - if (data_swa) { - if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) { - f = -INFINITY; - } - data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; - } - } - } - } - - if (data) { - for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_kv; ++j) { - data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; - } - } - } - - if (data_swa) { - for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_kv; ++j) { - data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; - } - } - } - } - } else { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_kq_mask->buffer)); - - float * data = (float *) inp.self_kq_mask->data; - - for (int h = 0; h < 1; ++h) { - for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = ubatch.seq_id[s1][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const int32_t tj = s1*n_seq_tokens + j; - - for (int s0 = 0; s0 < n_seqs; ++s0) { - for (int i = 0; i < n_seq_tokens; ++i) { - const int32_t ti = s0*n_seq_tokens + i; - float f = -INFINITY; - - for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { - if (ubatch.seq_id[s0][s] == seq_id) { - if (hparams.use_alibi) { - f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); - } else { - f = 0.0f; - } - break; - } - } - - data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f; - } - } - - for (int i = n_tokens; i < n_stride; ++i) { - data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY; - } - } - } - } - } - } - if (inp.self_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; @@ -3173,37 +3252,45 @@ ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( return inp.self_pos_bucket; } -void llama_context_kv_self::build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa) { +llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = std::make_shared(model.hparams, cparams, kv_self.get()); + const auto n_kv = kv_self->n; - inp.self_kq_mask = causal + inp->self_kq_mask = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp.self_kq_mask, "KQ_mask", -1); - ggml_set_input(inp.self_kq_mask); + //cb(inp->self_kq_mask, "KQ_mask", -1); + ggml_set_input(inp->self_kq_mask); - inp.self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.self_kq_mask, GGML_TYPE_F16) : inp.self_kq_mask; + inp->self_kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; if (swa) { const auto & hparams = model.hparams; GGML_ASSERT(hparams.n_swa > 0); - inp.self_kq_mask_swa = causal + inp->self_kq_mask_swa = causal ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp.self_kq_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(inp.self_kq_mask_swa); + //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp->self_kq_mask_swa); - inp.self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.self_kq_mask_swa, GGML_TYPE_F16) : inp.self_kq_mask_swa; + inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; } + + res->add_input(inp); + + return inp; } ggml_tensor * llama_context_kv_self::build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -3211,7 +3298,7 @@ ggml_tensor * llama_context_kv_self::build_attn( ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) { + int il) const { const auto & hparams = model.hparams; const auto & n_ctx = cparams.n_ctx; @@ -3280,7 +3367,7 @@ ggml_tensor * llama_context_kv_self::build_attn( } }; - const auto & kq_mask = is_sliding ? inp.self_kq_mask_swa_cnv : inp.self_kq_mask_cnv; + const auto & kq_mask = is_sliding ? inp->get_kq_mask_swa() : inp->get_kq_mask(); const auto n_kv = kv_self->n; @@ -3897,7 +3984,8 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -3927,11 +4015,11 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} - auto * t_logits = cparams.embeddings ? nullptr : res.t_logits; - auto * t_embd = cparams.embeddings ? res.t_embd : nullptr; + auto * t_logits = cparams.embeddings ? nullptr : res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - if (t_embd && res.t_embd_pooled) { - t_embd = res.t_embd_pooled; + if (t_embd && res->get_embd_pooled()) { + t_embd = res->get_embd_pooled(); } // extract logits @@ -4604,7 +4692,8 @@ int llama_context_enc::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set(ubatch); + res->set_inputs(&ubatch); + input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -4619,7 +4708,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { return -3; } - auto * t_embd = res.t_embd_pooled ? res.t_embd_pooled : res.t_embd; + auto * t_embd = res->get_embd_pooled() ? 
res->get_embd_pooled() : res->get_embd(); // extract embeddings if (t_embd) { @@ -4693,38 +4782,41 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // llama_context_dec // -void llama_context_dec::reserve() { - // simulate full KV cache - cross->t_embd = nullptr; +class llama_graph_input_attn_dec : public llama_graph_input_attn_i { +public: + llama_graph_input_attn_dec( + llama_graph_input_attn_i * inp_kv_self, + const llama_cross * cross) : inp_kv_self(inp_kv_self), cross(cross) {} - llama_context_kv_self::reserve(); -} + void set_input(const llama_ubatch * ubatch) override; -void llama_context_dec::input_set(const llama_ubatch & ubatch) { - // call base functionality - llama_context_kv_self::input_set(ubatch); + ggml_tensor * get_kq_mask() override { return inp_kv_self->get_kq_mask(); } + ggml_tensor * get_kq_mask_swa() override { return inp_kv_self->get_kq_mask_swa(); } + ggml_tensor * get_kq_mask_cross() override { return cross_kq_mask_cnv; } - if (inp.cross_embd && cross->t_embd) { - assert(inp.cross_embd->type == GGML_TYPE_F32); + ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch] + ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch] - ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd)); - } + llama_graph_input_attn_i * inp_kv_self = nullptr; + const llama_cross * cross = nullptr; +}; - if (inp.cross_kq_mask) { - const int64_t n_enc = inp.cross_kq_mask->ne[0]; - const int64_t n_tokens = ubatch.n_tokens; +void llama_graph_input_attn_dec::set_input(const llama_ubatch * ubatch) { + if (cross_kq_mask) { + const int64_t n_enc = cross_kq_mask->ne[0]; + const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(inp.cross_kq_mask->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing + GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer)); + GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - float * data = (float *) inp.cross_kq_mask->data; + float * data = (float *) cross_kq_mask->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_enc; ++i) { float f = -INFINITY; - for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[j][s]; + for (int s = 0; s < ubatch->n_seq_id[j]; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[j][s]; if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) { f = 0.0f; } @@ -4742,6 +4834,25 @@ void llama_context_dec::input_set(const llama_ubatch & ubatch) { } } +void llama_context_dec::reserve() { + // simulate full KV cache + cross->t_embd = nullptr; + + llama_context_kv_self::reserve(); +} + +void llama_context_dec::input_set(const llama_ubatch & ubatch) { + // call base functionality + llama_context_kv_self::input_set(ubatch); + + if (inp.cross_embd && cross->t_embd) { + assert(inp.cross_embd->type == GGML_TYPE_F32); + + ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd)); + } + +} + ggml_cgraph * llama_context_dec::graph_init() { inp = {}; @@ -4769,22 +4880,30 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd( return inp.cross_embd; } -void llama_context_dec::build_attn_inp( - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa) { - llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); +llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( + llama_graph_result * res, + ggml_context * ctx0, + int32_t 
n_tokens, + bool causal, + bool swa) const { + auto inp_kv_self = llama_context_kv_self::build_attn_inp(res, ctx0, n_tokens, causal, swa); + + auto inp = std::make_shared(inp_kv_self.get(), cross); const int32_t n_enc = cross->t_embd ? cross->t_embd->ne[1] : model.hparams.n_ctx_train; - inp.cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - ggml_set_input(inp.cross_kq_mask); + inp->cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + ggml_set_input(inp->cross_kq_mask); + + inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; + + res->add_input(inp); - inp.cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp.cross_kq_mask, GGML_TYPE_F16) : inp.cross_kq_mask; + return inp; } ggml_tensor * llama_context_dec::build_attn_cross( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -4792,10 +4911,10 @@ ggml_tensor * llama_context_dec::build_attn_cross( ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) { + int il) const { GGML_UNUSED(il); - const auto & kq_mask = inp.cross_kq_mask_cnv; + const auto & kq_mask = inp->get_kq_mask_cross(); ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); //cb(q, "q", il); diff --git a/src/llama-context.h b/src/llama-context.h index f44652e2d1f18..0f248537eded3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -251,22 +251,18 @@ class llama_context_base : public llama_context, public llama_graph_i { // when the compute graph is built, it creates the input tensors that it needs // the contents of the input tensors are set by the input_set() function + // TODO: remove, replace by llama_graph_input_i->set_input() virtual void input_set(const llama_ubatch & ubatch); private: + // TODO: remove, implement as llama_graph_input_xxx struct { // base input tensors - ggml_tensor * tokens; // I32 [n_batch] - ggml_tensor * embd; // F32 [n_embd, n_batch] ggml_tensor * pos; // I32 [n_batch] ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] ggml_tensor * out_ids; // I32 [n_outputs] ggml_tensor * mean; // F32 [n_batch, n_batch] ggml_tensor * cls; // I32 [n_batch] - - // KQ mask input tensors - ggml_tensor * kq_mask; // F32 [n_tokens, n_batch] - ggml_tensor * kq_mask_cnv; // [n_tokens, n_batch] } inp; protected: @@ -292,7 +288,7 @@ class llama_context_base : public llama_context, public llama_graph_i { virtual ggml_cgraph * graph_init(); // TODO: add encode/decode graphs - virtual llama_graph_result graph_build( + virtual llama_graph_result_ptr graph_build( ggml_context * ctx, ggml_cgraph * gf, const llama_ubatch & ubatch); @@ -344,9 +340,10 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_backend_buffer * bbuf) override; ggml_tensor * build_inp_embd( - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) override; + llama_graph_result * res, + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) const override; ggml_tensor * build_inp_pos( ggml_context * ctx0, @@ -367,21 +364,23 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_context * ctx0, int32_t n_tokens) override; - void build_attn_inp( + llama_graph_input_attn_ptr build_attn_inp( + llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa) override; + bool swa) const override; ggml_tensor * build_attn( + llama_graph_input_attn_i * 
inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - float kq_scale, - int il) override; + float kq_scale, + int il) const override; protected: virtual ggml_tensor * build_attn_mha( @@ -393,7 +392,7 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_tensor * kq_b, ggml_tensor * kq_mask, bool v_trans, - float kq_scale); + float kq_scale) const; virtual ggml_tensor * build_inp_self_k_shift( ggml_context * ctx0); @@ -563,10 +562,6 @@ class llama_context_kv_self : public llama_context_base { private: struct { ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] - ggml_tensor * self_kq_mask; // F32 [n_kv, n_batch] - ggml_tensor * self_kq_mask_cnv; // [n_kv, n_batch] - ggml_tensor * self_kq_mask_swa; // F32 [n_kv, n_batch] - ggml_tensor * self_kq_mask_swa_cnv; // [n_kv, n_batch] ggml_tensor * self_k_shift; // I32 [kv_size] } inp; @@ -586,21 +581,23 @@ class llama_context_kv_self : public llama_context_base { ggml_context * ctx0, int32_t n_tokens) override; - void build_attn_inp( + llama_graph_input_attn_ptr build_attn_inp( + llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa) override; + bool swa) const override; ggml_tensor * build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - float kq_scale, - int il) override; + float kq_scale, + int il) const override; protected: ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; @@ -786,8 +783,6 @@ class llama_context_dec : public llama_context_kv_self { private: struct { ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - ggml_tensor * cross_kq_mask; // F32 [n_outputs_enc, n_batch] - ggml_tensor * cross_kq_mask_cnv; // F32 [n_outputs_enc, n_batch] } inp; protected: @@ -800,13 +795,15 @@ class llama_context_dec : public llama_context_kv_self { ggml_tensor * build_inp_cross_embd( ggml_context * ctx0) override; - void build_attn_inp( + llama_graph_input_attn_ptr build_attn_inp( + llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa) override; + bool swa) const override; ggml_tensor * build_attn_cross( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -814,7 +811,7 @@ class llama_context_dec : public llama_context_kv_self { ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) override; + int il) const override; public: llama_cross * cross = nullptr; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1e336e844ada0..549a42c53ba22 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2,17 +2,34 @@ #include "llama-impl.h" +ggml_tensor * llama_graph_input_attn_i::get_kq_mask() { + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_input_attn_i::get_kq_mask_swa() { + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + +ggml_tensor * llama_graph_input_attn_i::get_kq_mask_cross() { + LLAMA_LOG_ERROR("%s: not implemented\n", __func__); + return nullptr; +} + llama_graph_i::llama_graph_i(llama_graph_type type) : type(type) {} ggml_tensor * llama_graph_i::build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - float kq_scale, - int il) { + float kq_scale, + int il) const { + GGML_UNUSED(inp); 
GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(q_cur); @@ -27,6 +44,7 @@ ggml_tensor * llama_graph_i::build_attn( } ggml_tensor * llama_graph_i::build_attn_cross( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -34,7 +52,8 @@ ggml_tensor * llama_graph_i::build_attn_cross( ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il) { + int il) const { + GGML_UNUSED(inp); GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(q_cur); diff --git a/src/llama-graph.h b/src/llama-graph.h index 28e8a563067db..a6a9ef00ca860 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include // note: do not add high-level objects here, such as llama_context, llama_kv_cache, etc. // not sure about llama_batch/llama_sbatch yet @@ -9,6 +11,7 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; struct ggml_backend_buffer; + struct llama_ubatch; enum llama_graph_type { @@ -17,13 +20,78 @@ enum llama_graph_type { LLAMA_GRAPH_TYPE_DECODER, }; -struct llama_graph_result { +// +// llama_graph_input +// + +class llama_graph_input_i { +public: + virtual ~llama_graph_input_i() = default; + + virtual void set_input(const llama_ubatch * ubatch) = 0; +}; + +using llama_graph_input_ptr = std::shared_ptr; + +class llama_graph_input_attn_i : public llama_graph_input_i { +public: + virtual ~llama_graph_input_attn_i() = default; + + virtual ggml_tensor * get_kq_mask(); + virtual ggml_tensor * get_kq_mask_swa(); + virtual ggml_tensor * get_kq_mask_cross(); +}; + +using llama_graph_input_attn_ptr = std::shared_ptr; + +// +// llama_graph_result +// + +class llama_graph_result_i { +public: + virtual ~llama_graph_result_i() = default; + + virtual ggml_tensor * get_logits() = 0; + virtual ggml_tensor * get_embd() = 0; + virtual ggml_tensor * get_embd_pooled() = 0; + + virtual void set_inputs(const llama_ubatch * ubatch) = 0; +}; + +using llama_graph_result_ptr = std::unique_ptr; + +class llama_graph_result : public llama_graph_result_i { +public: + llama_graph_result() = default; + virtual ~llama_graph_result() = default; + + ggml_tensor * get_logits() override { return t_logits; } + ggml_tensor * get_embd() override { return t_embd; } + ggml_tensor * get_embd_pooled() override { return t_embd_pooled; } + + void set_inputs(const llama_ubatch * ubatch) override { + for (auto & input : inputs) { + input->set_input(ubatch); + } + } + + void add_input(llama_graph_input_ptr && input) { + inputs.emplace_back(std::move(input)); + } + // important graph nodes ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; + + std::vector inputs; }; +// +// llama_graph +// + // TODO: can become more granular in the future class llama_graph_i { public: @@ -75,9 +143,10 @@ class llama_graph_i { // graph build API (context-specific) virtual ggml_tensor * build_inp_embd( + llama_graph_result * res, ggml_context * ctx0, ggml_tensor * tok_embd, - const llama_ubatch & ubatch) = 0; + const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. 
they don't mutate the llama_context that implements them virtual ggml_tensor * build_inp_pos( ggml_context * ctx0, @@ -98,23 +167,26 @@ class llama_graph_i { ggml_context * ctx0, int32_t n_tokens) = 0; - virtual void build_attn_inp( + virtual llama_graph_input_attn_ptr build_attn_inp( + llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, - bool swa) = 0; + bool swa) const = 0; virtual ggml_tensor * build_attn( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, - float kq_scale, - int il); + float kq_scale, + int il) const; virtual ggml_tensor * build_attn_cross( + llama_graph_input_attn_i * inp, ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q_cur, @@ -122,7 +194,7 @@ class llama_graph_i { ggml_tensor * v_cur, ggml_tensor * kq_b, float kq_scale, - int il); + int il) const; virtual ggml_tensor * build_inp_cross_embd( ggml_context * ctx0); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 25a705c657cd9..b6adbb1a1bbed 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2,7 +2,6 @@ #include "llama-impl.h" #include "llama-mmap.h" -#include "llama-graph.h" #include "llama-batch.h" #include "llama-cparams.h" #include "llama-model-loader.h" @@ -3853,7 +3852,7 @@ struct llm_build_context { ggml_context * ctx0 = nullptr; llama_graph_i * lgf = nullptr; - llama_graph_result res; + std::unique_ptr res; // TODO: consider making the entire interface noexcept llm_build_context( @@ -3892,7 +3891,8 @@ struct llm_build_context { pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), ctx0 (ctx), - lgf (lgf) { + lgf (lgf), + res (std::make_unique()) { } // TODO: tmp @@ -3902,7 +3902,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf->build_inp_embd(ctx0, tok_embd, ubatch); + struct ggml_tensor * inpL = lgf->build_inp_embd(res.get(), ctx0, tok_embd, ubatch); cb(inpL, "inp_embd", -1); return inpL; @@ -4259,15 +4259,16 @@ struct llm_build_context { } struct ggml_tensor * build_attn( - struct ggml_cgraph * gf, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, // TODO: remove - float kq_scale, - int il) { + llama_graph_input_attn_i * inp, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, // TODO: remove + float kq_scale, + int il) { GGML_UNUSED(n_tokens); // these nodes are added to the graph together so that they are not reordered @@ -4276,7 +4277,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(inp, ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); cb(cur, "kqv_out", il); if (wo) { @@ -4295,15 +4296,16 @@ struct llm_build_context { } struct ggml_tensor * build_attn_cross( - struct ggml_cgraph * gf, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, // TODO: remove - float kq_scale, - int il) { + llama_graph_input_attn_i * inp, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, 
+ ggml_tensor * v_cur, + int32_t n_tokens, // TODO: remove + float kq_scale, + int il) { GGML_UNUSED(n_tokens); // these nodes are added to the graph together so that they are not reordered @@ -4312,7 +4314,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn_cross(ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); + ggml_tensor * cur = lgf->build_attn_cross(inp, ctx0, gf, q_cur, k_cur, v_cur, nullptr, kq_scale, il); cb(cur, "kqv_out", il); if (wo) { @@ -4331,16 +4333,17 @@ struct llm_build_context { } struct ggml_tensor * build_attn_with_kq_b( - struct ggml_cgraph * gf, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * kq_b, - int32_t n_tokens, // TODO: remove - float kq_scale, - int il) { + llama_graph_input_attn_i * inp, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + int32_t n_tokens, // TODO: remove + float kq_scale, + int il) { GGML_UNUSED(n_tokens); // these nodes are added to the graph together so that they are not reordered @@ -4349,7 +4352,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - ggml_tensor * cur = lgf->build_attn(ctx0, gf, q_cur, k_cur, v_cur, kq_b, kq_scale, il); + ggml_tensor * cur = lgf->build_attn(inp, ctx0, gf, q_cur, k_cur, v_cur, kq_b, kq_scale, il); cb(cur, "kqv_out", il); if (wo) { @@ -4397,7 +4400,7 @@ struct llm_build_context { } void append_pooling(struct ggml_cgraph * gf) { - struct ggml_tensor * inp = res.t_embd; + struct ggml_tensor * inp = res->t_embd; //// find result_norm tensor for input //for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { @@ -4457,7 +4460,7 @@ struct llm_build_context { } cb(cur, "result_embd_pooled", -1); - res.t_embd_pooled = cur; + res->t_embd_pooled = cur; ggml_build_forward_expand(gf, cur); } @@ -4495,7 +4498,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4548,7 +4551,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } @@ -4626,7 +4629,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -4637,7 +4640,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -4656,7 +4659,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4720,7 +4723,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } @@ -4782,7 +4785,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -4793,7 +4796,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -4812,7 +4815,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4856,7 +4859,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -4903,13 +4906,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -4928,7 +4931,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4962,7 +4965,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5007,13 +5010,13 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5033,7 +5036,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5084,7 +5087,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5129,12 +5132,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5156,7 +5159,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - 
lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5206,7 +5209,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -5277,7 +5280,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -5288,7 +5291,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, 0.5773502691896257f); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5308,7 +5311,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5353,7 +5356,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5405,13 +5408,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5430,7 +5433,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5463,7 +5466,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5511,12 +5514,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5531,7 +5534,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5558,7 +5561,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5605,13 +5608,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5645,7 +5648,7 @@ struct 
llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - lgf->build_attn_inp(ctx0, n_tokens, false, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5710,7 +5713,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); @@ -5774,7 +5777,7 @@ struct llm_build_context { cur = inpL; cb(cur, "result_embd", -1); - res.t_embd = cur; + res->t_embd = cur; ggml_build_forward_expand(gf, cur); } @@ -5790,7 +5793,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); inpL = build_norm(inpL, model.tok_norm, @@ -5823,7 +5826,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5871,12 +5874,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -5893,7 +5896,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); if (model.pos_embd) { // inp_pos - contains the positions @@ -5956,13 +5959,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6012,12 +6015,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6035,7 +6038,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -6108,7 +6111,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6162,13 +6165,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = 
cur; ggml_build_forward_expand(gf, cur); } @@ -6186,7 +6189,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6228,7 +6231,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6275,13 +6278,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6300,7 +6303,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6343,7 +6346,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6388,13 +6391,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6413,7 +6416,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6461,7 +6464,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6506,13 +6509,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6531,7 +6534,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6574,7 +6577,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6651,13 +6654,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + 
res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6678,7 +6681,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6733,7 +6736,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -6773,7 +6776,7 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); @@ -6781,7 +6784,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6801,7 +6804,7 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - lgf->build_attn_inp(ctx0, n_tokens, true, true); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6856,7 +6859,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -6916,7 +6919,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); @@ -6926,7 +6929,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -6945,7 +6948,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -6981,7 +6984,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7025,13 +7028,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7051,7 +7054,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7084,7 +7087,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7132,12 +7135,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - 
res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7157,7 +7160,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7196,7 +7199,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7244,12 +7247,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7268,7 +7271,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7317,7 +7320,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7362,13 +7365,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7387,7 +7390,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7436,7 +7439,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7481,13 +7484,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7515,7 +7518,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7630,7 +7633,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, q_states, k_states, v_states, n_tokens, kq_scale, il); } @@ -7686,7 +7689,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", 
-1); - res.t_embd = cur; + res->t_embd = cur; // lm_head scaling const float scale_lmhead = float(n_embd_base)/float(n_embd); @@ -7697,7 +7700,7 @@ struct llm_build_context { cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7716,7 +7719,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { // norm @@ -7752,7 +7755,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -7799,13 +7802,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7824,7 +7827,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, true); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { // norm @@ -7866,7 +7869,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); } @@ -7923,7 +7926,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -7934,7 +7937,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -7954,7 +7957,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8003,7 +8006,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8049,13 +8052,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8103,13 +8106,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8129,7 +8132,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = 
lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -8203,7 +8206,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8247,7 +8250,7 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -8257,7 +8260,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8277,7 +8280,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, true); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8338,7 +8341,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8377,7 +8380,7 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -8387,7 +8390,7 @@ struct llm_build_context { } cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8412,7 +8415,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8461,7 +8464,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8507,13 +8510,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8532,7 +8535,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8576,7 +8579,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8627,13 +8630,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8656,7 +8659,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - 
lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8704,7 +8707,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8754,13 +8757,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8777,7 +8780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8834,7 +8837,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8881,12 +8884,12 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -8905,7 +8908,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -8944,7 +8947,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -9025,12 +9028,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9049,7 +9052,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9086,7 +9089,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -9154,13 +9157,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9179,7 +9182,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - 
lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9233,7 +9236,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, kq_scale, il); } @@ -9309,13 +9312,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9342,7 +9345,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9461,7 +9464,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, q_states, k_states, v_states, n_tokens, kq_scale, il); } @@ -9536,13 +9539,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9560,7 +9563,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9619,7 +9622,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, NULL, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); @@ -9687,14 +9690,14 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output cur = build_lora_mm(model.tok_embd, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9711,7 +9714,7 @@ struct llm_build_context { struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); - lgf->build_attn_inp(ctx0, n_tokens, false, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9740,7 +9743,7 @@ struct llm_build_context { struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b); - cur = build_attn_with_kq_b(gf, + cur = build_attn_with_kq_b(inp_attn.get(), gf, model.layers[il].wo_enc, nullptr, Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); cb(cur, "kqv_out", il); @@ -9793,7 +9796,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; ggml_build_forward_expand(gf, cur); } @@ -9814,7 +9817,7 @@ struct llm_build_context { const int64_t n_outputs_enc = embd_enc->ne[1]; - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9843,7 +9846,7 @@ struct llm_build_context { struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b); - cur = build_attn_with_kq_b(gf, + cur = build_attn_with_kq_b(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, kq_b, n_tokens, 1.0f, il); cb(cur, "kqv_out", il); @@ -9875,7 +9878,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc); - cur = build_attn_cross(gf, + cur = build_attn_cross(inp_attn.get(), gf, model.layers[il].wo_cross, nullptr, Qcur, Kcur, Vcur, n_tokens, 1.0f, il); cb(cur, "kqv_out", il); @@ -9955,13 +9958,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -9977,7 +9980,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10004,7 +10007,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/float(n_embd_head), il); } @@ -10047,12 +10050,12 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10071,7 +10074,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10132,7 +10135,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); @@ -10177,12 +10180,12 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - 
res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10201,7 +10204,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10251,7 +10254,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10297,13 +10300,13 @@ struct llm_build_context { LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10322,7 +10325,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10374,7 +10377,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10420,13 +10423,13 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10513,12 +10516,12 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10597,12 +10600,12 @@ struct llm_build_context { cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; ggml_build_forward_expand(gf, cur); } @@ -10627,7 +10630,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - lgf->build_attn_inp(ctx0, n_tokens, true, false); + auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10696,7 +10699,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = build_attn(gf, + cur = build_attn(inp_attn.get(), gf, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), il); @@ -10757,7 +10760,7 @@ struct llm_build_context { LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - res.t_embd = cur; + res->t_embd = cur; // lm_head cur = build_lora_mm(model.output, cur); @@ -10777,7 +10780,7 @@ struct llm_build_context { cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx); cb(cur, "result_output", -1); - res.t_logits = cur; + res->t_logits = cur; 
ggml_build_forward_expand(gf, cur); } @@ -10927,13 +10930,13 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); - res.t_embd = cur; + res->t_embd = cur; ggml_build_forward_expand(gf, cur); } }; -llama_graph_result llama_model::build_graph( +llama_graph_result_ptr llama_model::build_graph( ggml_context * ctx, ggml_cgraph * gf, llama_graph_i * lgf, @@ -11166,7 +11169,7 @@ llama_graph_result llama_model::build_graph( llm.append_pooling(gf); } - return llm.res; + return std::move(llm.res); } // diff --git a/src/llama-model.h b/src/llama-model.h index 447fc0d0576d6..2d64c0d242c09 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -3,6 +3,7 @@ #include "llama.h" #include "llama-arch.h" #include "llama-hparams.h" +#include "llama-graph.h" #include "llama-vocab.h" #include @@ -10,11 +11,9 @@ #include #include -class llama_graph_i; struct llama_cparams; struct llama_ubatch; struct llama_model_loader; -struct llama_graph_result; // available models enum llm_type { @@ -367,7 +366,7 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; // TODO: add encode/decode graphs - llama_graph_result build_graph( + llama_graph_result_ptr build_graph( ggml_context * ctx, ggml_cgraph * gf, llama_graph_i * lgf, From 9cab53c7ddeb029c7aeb787cf9fa7ea1779ba4b4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 18:01:25 +0200 Subject: [PATCH 82/84] cont : migrate the rest of the inputs out of llama_context ggml-ci --- src/llama-context.cpp | 922 ++++++++++++++++++++++++------------------ src/llama-context.h | 127 ++---- src/llama-graph.cpp | 86 ++-- src/llama-graph.h | 53 +-- src/llama-model.cpp | 36 +- 5 files changed, 646 insertions(+), 578 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5ac28f983027e..8587f480fd96f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,6 +71,243 @@ void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } +class llama_graph_input_pos : public llama_graph_input_i { +public: + llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + virtual ~llama_graph_input_pos() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos = nullptr; // I32 [n_batch] + + const int64_t n_pos_per_token = 1; +}; + +void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { + if (ubatch->pos && pos) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + } +} + +class llama_graph_input_pos_bucket : public llama_graph_input_i { +public: + llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} + virtual ~llama_graph_input_pos_bucket() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] + + const llama_hparams & hparams; +}; + +void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { + if (pos_bucket) { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + + int32_t * data = (int32_t *) pos_bucket->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_tokens; ++i) { + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], 
hparams.n_rel_attn_bkts, true);
+                }
+            }
+        }
+    }
+}
+
+class llama_graph_input_out_ids : public llama_graph_input_i {
+public:
+    llama_graph_input_out_ids(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+    virtual ~llama_graph_input_out_ids() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * out_ids; // I32 [n_outputs]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const int32_t n_outputs;
+};
+
+void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
+    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        //GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
+
+        if (!out_ids) {
+            LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch->n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
+            int32_t * data = (int32_t *) out_ids->data;
+
+            if (n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
+                }
+            } else if (ubatch->output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch->output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(n_outputs == n_outputs);
+            } else if (n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(n_outputs == 0);
+            }
+        }
+    }
+}
+
+class llama_graph_input_mean : public llama_graph_input_i {
+public:
+    llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
+    virtual ~llama_graph_input_mean() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * mean; // F32 [n_batch, n_batch]
+
+    const llama_cparams & cparams;
+};
+
+void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+        const int64_t n_tokens = ubatch->n_tokens;
+        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+        const int64_t n_seqs = ubatch->n_seqs;
+
+        GGML_ASSERT(mean);
+        GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer));
+
+        float * data = (float *) mean->data;
+        memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean));
+
+        std::vector<uint64_t> sum(n_tokens, 0);
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+            // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
+
+            sum[seq_id] += ubatch->n_seq_tokens;
+        }
+
+        std::vector<float> div(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const uint64_t s = sum[i];
+            if (s > 0) {
+                div[i] = 1.0f/float(s);
+            }
+        }
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+            for (int i = 0; i < n_seq_tokens; ++i) {
+                data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
+            }
+        }
+    }
+}
+
+class llama_graph_input_cls : public llama_graph_input_i {
+public:
+    llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    virtual ~llama_graph_input_cls() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cls; // I32 [n_batch]
+
+    const llama_cparams & cparams;
+};
+
+void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) {
+    if (cparams.embeddings && (
+            cparams.pooling_type == LLAMA_POOLING_TYPE_CLS
|| + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } +} + class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : @@ -846,7 +1083,6 @@ int llama_context_base::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove, tmp here, until all inputs are migrated outside the context const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -1003,7 +1239,6 @@ int llama_context_base::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -1132,178 +1367,6 @@ int64_t llama_context_base::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 
4 : 1; } -void llama_context_base::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (ubatch.pos && inp.pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp.pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp.pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp.out_ids && "every model that can must skip unused outputs"); - - if (!inp.out_ids) { - LLAMA_LOG_WARN("%s: 'inp.out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.out_ids->buffer)); - int32_t * data = (int32_t *) inp.out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp.mean->buffer)); - - float * data = (float *) inp.mean->data; - memset(inp.mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp.mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - - uint32_t * data = (uint32_t *) inp.cls->data; - memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.cls); - 
GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - - uint32_t * data = (uint32_t *) inp.cls->data; - memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } - - if (inp.pos_bucket) { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.pos_bucket->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - - int32_t * data = (int32_t *) inp.pos_bucket->data; - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, true); - } - } - } - } - - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); -} - // // output // @@ -1423,8 +1486,6 @@ int32_t llama_context_base::graph_max_nodes() const { } ggml_cgraph * llama_context_base::graph_init() { - inp = {}; - struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta.size(), /*.mem_buffer =*/ buf_compute_meta.data(), @@ -1478,7 +1539,7 @@ void llama_context_base::build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il) { + int il) const { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); } else { @@ -1498,7 +1559,7 @@ void llama_context_base::build_cb( if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); - for (auto & backend : backends) { + for (const auto & backend : backends) { if (ggml_backend_get_device(backend.get()) == dev_layer) { if (ggml_backend_supports_op(backend.get(), cur)) { ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get()); @@ -1512,14 +1573,14 @@ void llama_context_base::build_cb( ggml_tensor * llama_context_base::build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il) { + int il) const { return cvec.apply_to(ctx0, cur, il); } ggml_tensor * llama_context_base::build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur) { + ggml_tensor * cur) const { struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); for (const auto & lora : loras) { @@ -1547,7 +1608,7 @@ ggml_tensor * llama_context_base::build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur, - ggml_tensor * ids) { + ggml_tensor * ids) const { struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); for (const auto & lora : loras) { struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); @@ -1572,7 +1633,7 @@ ggml_tensor * llama_context_base::build_lora_mm_id( return res; } -ggml_tensor * llama_context_base::build_rope_factors(int il) { +ggml_tensor * 
llama_context_base::build_rope_factors(int il) const { const auto & hparams = model.hparams; // choose long/short freq factors based on the context size @@ -1594,7 +1655,7 @@ ggml_tensor * llama_context_base::build_rope_shift( ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuf) { + ggml_backend_buffer * bbuf) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & freq_base = cparams.rope_freq_base; const auto & freq_scale = cparams.rope_freq_scale; @@ -1614,7 +1675,7 @@ ggml_tensor * llama_context_base::build_rope_shift( tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); if (bbuf) { - for (auto & backend : backends) { + for (const auto & backend : backends) { // Figure out which backend KV cache belongs to if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); @@ -1693,50 +1754,73 @@ ggml_tensor * llama_context_base::build_inp_embd( return inpL; } -ggml_tensor * llama_context_base::build_inp_pos( - ggml_context * ctx0, - int32_t n_tokens) { - inp.pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp.pos); +ggml_tensor * llama_context_base::build_inp_pos( + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(n_pos_per_token()); + + inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp->pos); + + res->add_input(inp); - return inp.pos; + return inp->pos; } ggml_tensor * llama_context_base::build_inp_pos_bucket( - ggml_context * ctx0, - int32_t n_tokens) { - inp.pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - ggml_set_input(inp.pos_bucket); + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(model.hparams); + + inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + ggml_set_input(inp->pos_bucket); + + res->add_input(inp); - return inp.pos_bucket; + return inp->pos_bucket; } ggml_tensor * llama_context_base::build_inp_out_ids( - ggml_context * ctx0) { - const int32_t n_out_ids = n_outputs; + llama_graph_result * res, + ggml_context * ctx0) const { + auto inp = std::make_shared(model.hparams, cparams, n_outputs); - inp.out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids); - ggml_set_input(inp.out_ids); + inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + ggml_set_input(inp->out_ids); + + res->add_input(inp); - return inp.out_ids; + return inp->out_ids; } ggml_tensor * llama_context_base::build_inp_mean( - ggml_context * ctx0, - int32_t n_tokens) { - inp.mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp.mean); + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(cparams); + + inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp->mean); - return inp.mean; + res->add_input(inp); + + return inp->mean; } ggml_tensor * llama_context_base::build_inp_cls( - ggml_context * ctx0, - int32_t n_tokens) { - inp.cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp.cls); + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(cparams); + + inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->cls); + + res->add_input(inp); - return inp.cls; + 
return inp->cls; } llama_graph_input_attn_ptr llama_context_base::build_attn_inp( @@ -1887,33 +1971,6 @@ ggml_tensor * llama_context_base::build_attn_mha( return cur; } -ggml_tensor * llama_context_base::build_inp_self_k_shift( - ggml_context * ctx0) { - GGML_UNUSED(ctx0); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); - return nullptr; -} - -void llama_context_base::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf) { - GGML_UNUSED(ctx0); - GGML_UNUSED(gf); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); -} - -void llama_context_base::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf) { - GGML_UNUSED(ctx0); - GGML_UNUSED(gf); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); -} - - // // perf // @@ -2428,6 +2485,68 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i // llama_context_kv_self // +class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { +public: + llama_graph_input_pos_bucket_kv( + const llama_hparams & hparams, + const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {} + virtual ~llama_graph_input_pos_bucket_kv() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] + + const llama_hparams & hparams; + const llama_kv_cache_unified * kv_self; +}; + +void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { + if (pos_bucket) { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + + int32_t * data = (int32_t *) pos_bucket->data; + + const int64_t n_kv = kv_self->n; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_kv; ++i) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false); + } + } + } + } +} + +class llama_graph_input_k_shift : public llama_graph_input_i { +public: + llama_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} + virtual ~llama_graph_input_k_shift() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * k_shift; // I32 [kv_size] + + const llama_kv_cache_unified * kv_self; +}; + +void llama_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (k_shift) { + assert(ggml_backend_buffer_is_host(k_shift->buffer)); + + int32_t * data = (int32_t *) k_shift->data; + + for (uint32_t i = 0; i < kv_self->size; ++i) { + data[i] = kv_self->cells[i].delta; + } + } +} + class llama_graph_input_attn_kv_self : public llama_graph_input_attn_i { public: llama_graph_input_attn_kv_self( @@ -2661,11 +2780,11 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - build_kv_self_shift(ctx_compute.get(), gf); + auto res = graph_build_kv_self_shift(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set({}); + res->set_inputs(nullptr); graph_compute(gf, false); @@ -2689,7 +2808,7 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - build_kv_self_defrag(ctx_compute.get(), gf); + graph_build_kv_self_defrag(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2792,7 +2911,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - 
input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -3031,7 +3149,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -3190,66 +3307,24 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (inp.self_k_shift) { - assert(ggml_backend_buffer_is_host(inp.self_k_shift->buffer)); - - int32_t * data = (int32_t *) inp.self_k_shift->data; - - for (uint32_t i = 0; i < kv_self->size; ++i) { - data[i] = kv_self->cells[i].delta; - } - - // the K-shift graph requires just this input - return; - } - - // call base functionality - llama_context_base::input_set(ubatch); - - if (inp.self_pos_bucket) { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_pos_bucket->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - - int32_t * data = (int32_t *) inp.self_pos_bucket->data; - - const int64_t n_kv = kv_self->n; - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); - } - } - } - } -} - ggml_cgraph * llama_context_kv_self::graph_init() { - inp = {}; - return llama_context_base::graph_init(); } -ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { - inp.self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); - ggml_set_input(inp.self_k_shift); - - return inp.self_k_shift; -} - ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( - ggml_context * ctx0, - int32_t n_tokens) { + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(model.hparams, kv_self.get()); + const auto n_kv = kv_self->n; - inp.self_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - ggml_set_input(inp.self_pos_bucket); + inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + ggml_set_input(inp->pos_bucket); - return inp.self_pos_bucket; + res->inputs.push_back(inp); + + return inp->pos_bucket; } llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( @@ -3404,9 +3479,11 @@ ggml_tensor * llama_context_kv_self::build_attn( return cur; } -void llama_context_kv_self::build_kv_self_shift( +llama_graph_result_ptr llama_context_kv_self::graph_build_kv_self_shift( ggml_context * ctx0, - ggml_cgraph * gf) { + ggml_cgraph * gf) const { + auto res = std::make_unique(); + const auto & hparams = model.hparams; const auto & n_layer = hparams.n_layer; @@ -3416,7 +3493,12 @@ void llama_context_kv_self::build_kv_self_shift( //GGML_ASSERT(kv_self->size == n_ctx); - ggml_tensor * inp_self_k_shift = build_inp_self_k_shift(ctx0); + auto inp = std::make_shared(kv_self.get()); + + inp->k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + ggml_set_input(inp->k_shift); + + res->add_input(inp); for (uint32_t il = 0; il < n_layer; ++il) { const int64_t n_head_kv = hparams.n_head_kv(il); @@ -3431,15 +3513,17 @@ void llama_context_kv_self::build_kv_self_shift( 
ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self->k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, kv_self->k_l[il]->buffer); ggml_build_forward_expand(gf, cur); } + + return res; } -void llama_context_kv_self::build_kv_self_defrag( +llama_graph_result_ptr llama_context_kv_self::graph_build_kv_self_defrag( ggml_context * ctx0, - ggml_cgraph * gf) { + ggml_cgraph * gf) const { const auto & hparams = model.hparams; const uint32_t n_layer = hparams.n_layer; @@ -3454,7 +3538,7 @@ void llama_context_kv_self::build_kv_self_defrag( // number of cells moved uint32_t n_moves = 0; - // each move requires 6*n_layer tensors (see build_kv_self_defrag) + // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag) // - source view, destination view, copy operation // - x2 for keys and values //const uint32_t max_moves = max_nodes()/(6*n_layer); @@ -3565,7 +3649,7 @@ void llama_context_kv_self::build_kv_self_defrag( } if (n_moves == 0) { - return; + return nullptr; } //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); @@ -3705,6 +3789,8 @@ void llama_context_kv_self::build_kv_self_defrag( //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); #endif + + return nullptr; } // state save/load @@ -3747,6 +3833,89 @@ size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_se // llama_context_recurrent // +class llama_graph_input_s_copy : public llama_graph_input_i { +public: + llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} + virtual ~llama_graph_input_s_copy() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * s_copy; // I32 [kv_size] + + llama_kv_cache_recurrent * kv_self; +}; + +void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + const int64_t n_kv = kv_self->n; + + if (s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); + int32_t * data = (int32_t *) s_copy->data; + + // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! 
+ + // prevent out-of-bound sources + if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { + kv_cell.src = cell_id; + } + + data[i] = kv_cell.src; + + // TODO: do not mutate the KV cache + // ensure copy only happens once + if (kv_cell.src != (int32_t) cell_id) { + kv_cell.src = cell_id; + } + } + } +} + +class llama_graph_input_s_mask : public llama_graph_input_i { +public: + llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} + virtual ~llama_graph_input_s_mask() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * s_mask; // F32 [1, n_kv] + + llama_kv_cache_recurrent * kv_self; +}; + +void llama_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + const int64_t n_kv = kv_self->n; + + if (s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); + float * data = (float *) s_mask->data; + + // clear unused states + for (int i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; + + data[i] = (float) (kv_cell.src >= 0); + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! + // only clear once + if (kv_cell.src < 0) { + kv_cell.src = cell_id; + } + } + } +} + llama_context_recurrent::llama_context_recurrent( const llama_model & model, llama_context_params params, @@ -3985,7 +4154,6 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -4130,85 +4298,40 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { return 0; } -void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { - // call base functionality - llama_context_base::input_set(ubatch); - - GGML_ASSERT(kv_self->recurrent); - - const int64_t n_kv = kv_self->n; - - if (inp.s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_mask->buffer)); - float * data = (float *) inp.s_mask->data; - - // clear unused states - for (int i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self->head; - llama_kv_cell & kv_cell = kv_self->cells[cell_id]; - - data[i] = (float) (kv_cell.src >= 0); - - // TODO: do not mutate the KV cache - // only clear once - if (kv_cell.src < 0) { - kv_cell.src = cell_id; - } - } - } - - if (inp.s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_copy->buffer)); - int32_t * data = (int32_t *) inp.s_copy->data; - - // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self->head; - llama_kv_cell & kv_cell = kv_self->cells[cell_id]; - - // prevent out-of-bound sources - if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { - kv_cell.src = cell_id; - } - - data[i] = kv_cell.src; - - // TODO: do not mutate the KV cache - // ensure copy only happens once - if (kv_cell.src != (int32_t) cell_id) { - kv_cell.src = cell_id; - } - } - } -} - ggml_cgraph * llama_context_recurrent::graph_init() { - inp.s_copy = nullptr; - inp.s_mask = nullptr; - return llama_context_base::graph_init(); } ggml_tensor * llama_context_recurrent::build_inp_s_copy( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + auto inp = std::make_shared(kv_self.get()); + const auto n_kv = kv_self->n; - 
inp.s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); //cb(inp.s_copy, "inp_s_copy", -1); - ggml_set_input(inp.s_copy); + ggml_set_input(inp->s_copy); + + res->add_input(inp); - return inp.s_copy; + return inp->s_copy; } ggml_tensor * llama_context_recurrent::build_inp_s_mask( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + auto inp = std::make_shared(kv_self.get()); + const auto n_kv = kv_self->n; - inp.s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - //cb(inp.s_mask, "inp_s_mask", -1); - ggml_set_input(inp.s_mask); + inp->s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp->s_mask, "inp_s_mask", -1); + ggml_set_input(inp->s_mask); + + res->add_input(inp); - return inp.s_mask; + return inp->s_mask; } ggml_tensor * llama_context_recurrent::build_copy_mask_state( @@ -4218,7 +4341,7 @@ ggml_tensor * llama_context_recurrent::build_copy_mask_state( ggml_tensor * state_copy, ggml_tensor * state_mask, int32_t n_state, - int32_t n_seqs) { + int32_t n_seqs) const { const auto n_kv = kv_self->n; const auto kv_head = kv_self->head; @@ -4251,7 +4374,7 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) { + int il) const { const auto & hparams = model.hparams; const auto kv_head = kv_self->head; @@ -4383,7 +4506,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_load( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) { + int il) const { const auto & hparams = model.hparams; const auto token_shift_count = hparams.token_shift_count; @@ -4405,7 +4528,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il) { + int il) const { const auto & hparams = model.hparams; const auto token_shift_count = hparams.token_shift_count; @@ -4430,7 +4553,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix( ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) { + int il) const { const auto & hparams = model.hparams; const auto n_tokens = ubatch.n_tokens; @@ -4693,7 +4816,6 @@ int llama_context_enc::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -4782,6 +4904,29 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // llama_context_dec // +class llama_graph_input_cross_embd : public llama_graph_input_i { +public: + llama_graph_input_cross_embd( + const llama_cross * cross) : cross(cross) {} + virtual ~llama_graph_input_cross_embd() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] + + const llama_cross * cross; +}; + +void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (cross_embd && cross->t_embd) { + assert(cross_embd->type == GGML_TYPE_F32); + + ggml_backend_tensor_set(cross_embd, cross->v_embd, 0, ggml_nbytes(cross_embd)); + } +} + class llama_graph_input_attn_dec : public llama_graph_input_attn_i { public: llama_graph_input_attn_dec( @@ -4841,32 +4986,21 @@ void llama_context_dec::reserve() { llama_context_kv_self::reserve(); } -void 
llama_context_dec::input_set(const llama_ubatch & ubatch) { - // call base functionality - llama_context_kv_self::input_set(ubatch); - - if (inp.cross_embd && cross->t_embd) { - assert(inp.cross_embd->type == GGML_TYPE_F32); - - ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd)); - } - -} - ggml_cgraph * llama_context_dec::graph_init() { - inp = {}; - return llama_context_kv_self::graph_init(); } ggml_tensor * llama_context_dec::build_inp_cross_embd( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + auto inp = std::make_shared(cross); + // if we have the output embeddings from the encoder, use them directly // TODO: needs more work to be correct, for now just use the tensor shape //if (cross->t_embd) { - // inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + // inp->cross_embd = ggml_view_tensor(ctx0, cross->t_embd); - // return inp.cross_embd; + // return inp->cross_embd; //} const auto & hparams = model.hparams; @@ -4874,10 +5008,12 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd( const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; const auto n_enc = cross->t_embd ? cross->t_embd->ne[1] : hparams.n_ctx_train; - inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); - ggml_set_input(inp.cross_embd); + inp->cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); + ggml_set_input(inp->cross_embd); + + res->add_input(inp); - return inp.cross_embd; + return inp->cross_embd; } llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( diff --git a/src/llama-context.h b/src/llama-context.h index 0f248537eded3..21015e8796e40 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -248,24 +248,6 @@ class llama_context_base : public llama_context, public llama_graph_i { virtual int64_t n_pos_per_token() const; // vision - // when the compute graph is built, it creates the input tensors that it needs - // the contents of the input tensors are set by the input_set() function - - // TODO: remove, replace by llama_graph_input_i->set_input() - virtual void input_set(const llama_ubatch & ubatch); - -private: - // TODO: remove, implement as llama_graph_input_xxx - struct { - // base input tensors - ggml_tensor * pos; // I32 [n_batch] - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - ggml_tensor * out_ids; // I32 [n_outputs] - ggml_tensor * mean; // F32 [n_batch, n_batch] - ggml_tensor * cls; // I32 [n_batch] - } inp; - -protected: // // output // @@ -309,35 +291,35 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il) override; + int il) const override; // apply control vector for layer il ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il) override; + int il) const override; // do mat_mul, while optionally apply lora ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur) override; + ggml_tensor * cur) const override; // do mat_mul_id, while optionally apply lora ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids) override; + ggml_tensor * ids) const override; - ggml_tensor * build_rope_factors(int il) override; + ggml_tensor * build_rope_factors(int il) const override; ggml_tensor * build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - 
ggml_backend_buffer * bbuf) override; + ggml_backend_buffer * bbuf) const override; ggml_tensor * build_inp_embd( llama_graph_result * res, @@ -346,23 +328,28 @@ class llama_context_base : public llama_context, public llama_graph_i { const llama_ubatch & ubatch) const override; ggml_tensor * build_inp_pos( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_out_ids( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * build_inp_mean( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_cls( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -394,18 +381,6 @@ class llama_context_base : public llama_context, public llama_graph_i { bool v_trans, float kq_scale) const; - virtual ggml_tensor * build_inp_self_k_shift( - ggml_context * ctx0); - - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf); - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf); - public: // // perf @@ -552,19 +527,6 @@ class llama_context_kv_self : public llama_context_base { int encode(llama_batch & inp_batch) override; int decode(llama_batch & inp_batch) override; -protected: - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] - ggml_tensor * self_k_shift; // I32 [kv_size] - } inp; - protected: // // graph @@ -578,8 +540,9 @@ class llama_context_kv_self : public llama_context_base { // ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -600,16 +563,14 @@ class llama_context_kv_self : public llama_context_base { int il) const override; protected: - ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; - - void build_kv_self_shift( + llama_graph_result_ptr graph_build_kv_self_shift( ggml_context * ctx0, - ggml_cgraph * gf) override; + ggml_cgraph * gf) const; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - void build_kv_self_defrag( + llama_graph_result_ptr graph_build_kv_self_defrag( ggml_context * ctx0, - ggml_cgraph * gf) override; + ggml_cgraph * gf) const; // // state save/load @@ -651,19 +612,6 @@ class llama_context_recurrent : public llama_context_base { int encode(llama_batch & inp_batch) override; int decode(llama_batch & inp_batch) override; -protected: - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * s_copy; // I32 [kv_size] - ggml_tensor * s_mask; // F32 [1, n_kv] - } inp; - protected: // // graph @@ -677,10 +625,12 @@ class llama_context_recurrent : public llama_context_base { // ggml_tensor * build_inp_s_copy( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * 
build_inp_s_mask( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -689,7 +639,7 @@ class llama_context_recurrent : public llama_context_base { ggml_tensor * state_copy, ggml_tensor * state_mask, int32_t n_state, - int32_t n_seqs) override; + int32_t n_seqs) const override; ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -698,7 +648,7 @@ class llama_context_recurrent : public llama_context_base { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -706,13 +656,13 @@ class llama_context_recurrent : public llama_context_base { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -722,7 +672,7 @@ class llama_context_recurrent : public llama_context_base { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; protected: // @@ -774,18 +724,6 @@ class llama_context_dec : public llama_context_kv_self { protected: void reserve() override; - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - } inp; - -protected: // // graph // @@ -793,7 +731,8 @@ class llama_context_dec : public llama_context_kv_self { ggml_cgraph * graph_init() override; ggml_tensor * build_inp_cross_embd( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 549a42c53ba22..79b26d1734ca3 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -68,25 +68,19 @@ ggml_tensor * llama_graph_i::build_attn_cross( } ggml_tensor * llama_graph_i::build_inp_cross_embd( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } -ggml_tensor * llama_graph_i::build_inp_cross_kq_mask( - ggml_context * ctx0, - int32_t n_tokens) { - GGML_UNUSED(ctx0); - GGML_UNUSED(n_tokens); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); - return nullptr; -} - ggml_tensor * llama_graph_i::build_inp_s_copy ( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -95,7 +89,9 @@ ggml_tensor * llama_graph_i::build_inp_s_copy ( } ggml_tensor * llama_graph_i::build_inp_s_mask( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -104,13 +100,13 @@ ggml_tensor * llama_graph_i::build_inp_s_mask( } ggml_tensor * llama_graph_i::build_copy_mask_state( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * s, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - int32_t n_state, - int32_t n_seqs) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * s, + ggml_tensor 
* state_copy, + ggml_tensor * state_mask, + int32_t n_state, + int32_t n_seqs) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(s); @@ -125,13 +121,13 @@ ggml_tensor * llama_graph_i::build_copy_mask_state( } ggml_tensor * llama_graph_i::build_mamba_layer( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * cur, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); @@ -146,12 +142,12 @@ ggml_tensor * llama_graph_i::build_mamba_layer( } ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(state_copy); @@ -165,10 +161,10 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( } ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( - ggml_context * ctx0, - ggml_tensor * token_shift, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(token_shift); GGML_UNUSED(ubatch); @@ -180,14 +176,14 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( } ggml_tensor * llama_graph_i::build_rwkv6_time_mix( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * cur, - ggml_tensor * x_prev, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); diff --git a/src/llama-graph.h b/src/llama-graph.h index a6a9ef00ca860..7ae99becc7e23 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -93,6 +93,7 @@ class llama_graph_result : public llama_graph_result_i { // // TODO: can become more granular in the future +// TODO: move all methods that do not require things from llama_context to llm_build_context class llama_graph_i { public: llama_graph_i(llama_graph_type type); @@ -109,28 +110,28 @@ class llama_graph_i { ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il) = 0; + int il) const = 0; // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il) = 0; + int il) const = 0; // do mat_mul, while optionally apply lora virtual ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur) = 0; + ggml_tensor * cur) const = 0; // do mat_mul_id, while optionally apply lora virtual ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids) = 0; + ggml_tensor * ids) const = 0; - virtual ggml_tensor * build_rope_factors(int il) = 0; + virtual ggml_tensor * build_rope_factors(int il) const = 0; // note: optionally set the backend to be the same as the bbuf's backend virtual ggml_tensor * build_rope_shift( @@ -138,7 +139,7 @@ class llama_graph_i { ggml_tensor * cur, 
ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuf) = 0; + ggml_backend_buffer * bbuf) const = 0; // graph build API (context-specific) @@ -146,26 +147,31 @@ class llama_graph_i { llama_graph_result * res, ggml_context * ctx0, ggml_tensor * tok_embd, - const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. they don't mutate the llama_context that implements them + const llama_ubatch & ubatch) const = 0; virtual ggml_tensor * build_inp_pos( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0) = 0; + llama_graph_result * res, + ggml_context * ctx0) const = 0; virtual ggml_tensor * build_inp_mean( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_cls( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -197,17 +203,16 @@ class llama_graph_i { int il) const; virtual ggml_tensor * build_inp_cross_embd( - ggml_context * ctx0); - - virtual ggml_tensor * build_inp_cross_kq_mask( - ggml_context * ctx0, - int32_t n_tokens); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -216,7 +221,7 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, int32_t n_state, - int32_t n_seqs); + int32_t n_seqs) const; virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -225,7 +230,7 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -233,13 +238,13 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -249,5 +254,5 @@ class llama_graph_i { ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b6adbb1a1bbed..7fae82c6ecc49 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3910,7 +3910,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens); cb(cur, "inp_pos", -1); return cur; @@ -3918,7 +3918,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(ctx0); + ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0); cb(cur, "inp_out_ids", -1); return cur; @@ -3926,7 +3926,7 @@ struct llm_build_context { // TODO: tmp struct 
ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens); cb(cur, "inp_mean", -1); return cur; @@ -3934,7 +3934,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens); cb(cur, "inp_cls", -1); return cur; @@ -3957,7 +3957,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_pos_bucket() { - ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens); cb(cur, "pos_bucket", -1); return cur; @@ -3965,20 +3965,12 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_cross_embd() { - ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0); + ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0); cb(cur, "embd_enc", -1); return cur; } - // TODO: tmp - struct ggml_tensor * build_inp_cross_kq_mask() { - ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens); - cb(cur, "KQ_mask_cross", -1); - - return cur; - } - struct ggml_tensor * build_norm( struct ggml_tensor * cur, struct ggml_tensor * mw, @@ -3986,8 +3978,8 @@ struct llm_build_context { llm_norm_type type, int il) { switch (type) { - case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break; case LLM_NORM_GROUP: { cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); @@ -8070,8 +8062,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); for (int il = 0; il < n_layer; ++il) { // norm @@ -10443,8 +10435,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10535,8 +10527,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; From 0f7daa9d1bce23b962d6c648dc4d7f71d338c8c6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 19:56:10 +0200 Subject: [PATCH 83/84] graph : move non-context related logic to llm_build_context ggml-ci --- src/llama-context.cpp | 520 +++++++++--------------------------------- src/llama-context.h | 118 ++++------ 
src/llama-graph.cpp | 12 +- src/llama-graph.h | 67 +++--- src/llama-model.cpp | 425 +++++++++++++++++++++++++++------- src/llama-model.h | 1 - 6 files changed, 529 insertions(+), 614 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8587f480fd96f..7ba86a2a7f91a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,26 +71,7 @@ void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } -class llama_graph_input_pos : public llama_graph_input_i { -public: - llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} - virtual ~llama_graph_input_pos() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * pos = nullptr; // I32 [n_batch] - - const int64_t n_pos_per_token = 1; -}; - -void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { - if (ubatch->pos && pos) { - const int64_t n_tokens = ubatch->n_tokens; - - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); - } -} - +// I32 [n_batch, n_batch] class llama_graph_input_pos_bucket : public llama_graph_input_i { public: llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} @@ -98,19 +79,17 @@ class llama_graph_input_pos_bucket : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - const llama_hparams & hparams; }; void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { - if (pos_bucket) { + if (cur) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) cur->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -122,192 +101,6 @@ void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { } } -class llama_graph_input_out_ids : public llama_graph_input_i { -public: - llama_graph_input_out_ids( - const llama_hparams & hparams, - const llama_cparams & cparams, - int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} - virtual ~llama_graph_input_out_ids() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * out_ids; // I32 [n_outputs] - - const llama_hparams & hparams; - const llama_cparams & cparams; - - const int32_t n_outputs; -}; - -void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); - - if (!out_ids) { - LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch->n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); - int32_t * data = (int32_t *) out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch->output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch->output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } 
- } -} - -class llama_graph_input_mean : public llama_graph_input_i { -public: - llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} - virtual ~llama_graph_input_mean() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * mean; // F32 [n_batch, n_batch] - - const llama_cparams & cparams; -}; - -void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) { - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(mean); - GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); - - float * data = (float *) mean->data; - memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch->n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } -} - -class llama_graph_input_cls : public llama_graph_input_i { -public: - llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} - virtual ~llama_graph_input_cls() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cls; // I32 [n_batch] - - const llama_cparams & cparams; -}; - -void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) { - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(cls); - GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_tokens * ggml_element_size(cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(cls); - GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_tokens * ggml_element_size(cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than 
n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } -} - class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : @@ -1359,14 +1152,6 @@ int llama_context_base::decode(llama_batch & inp_batch) { return 0; } -// -// input -// - -int64_t llama_context_base::n_pos_per_token() const { - return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; -} - // // output // @@ -1535,6 +1320,10 @@ enum ggml_status llama_context_base::graph_compute( // graph build API // +int32_t llama_context_base::get_n_outputs() const { + return n_outputs; +} + void llama_context_base::build_cb( ggml_tensor * cur, const char * name, @@ -1650,57 +1439,7 @@ ggml_tensor * llama_context_base::build_rope_factors(int il) const { return model.layers[il].rope_short; } -ggml_tensor * llama_context_base::build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const { - const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & freq_base = cparams.rope_freq_base; - const auto & freq_scale = cparams.rope_freq_scale; - - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_attn_factor = cparams.yarn_attn_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; - - const auto & n_rot = model.hparams.n_rot; - const auto & rope_type = model.hparams.rope_type; - - struct ggml_tensor * tmp; - - if (ggml_is_quantized(cur->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); - - if (bbuf) { - for (const auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - } - - tmp = ggml_rope_ext_inplace(ctx0, tmp, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - - tmp = ggml_cpy(ctx0, tmp, cur); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, cur, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - } - - return tmp; -} - -ggml_tensor * llama_context_base::build_inp_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_context_base::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const { @@ -1710,14 +1449,14 @@ ggml_tensor * llama_context_base::build_inp_embd( auto inp = std::make_shared(); - struct ggml_tensor * inpL; + auto & cur = inp->cur; if (ubatch.token) { inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); //cb(inp->tokens, "inp_tokens", -1); ggml_set_input(inp->tokens); - inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens); + cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); // apply lora for embedding tokens if needed for (const auto & lora : loras) { @@ -1734,97 +1473,36 @@ ggml_tensor * llama_context_base::build_inp_embd( 
ggml_get_rows(ctx0, lw->a, inp->tokens) ), scale); - inpL = ggml_add(ctx0, inpL, inpL_delta); + cur = ggml_add(ctx0, cur, inpL_delta); } } else { inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp->embd; + cur = inp->embd; ggml_set_input(inp->embd); } // For Granite architecture if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); } - res->add_input(std::move(inp)); - - //cb(inpL, "inp_embd", -1); - - return inpL; -} - -ggml_tensor * llama_context_base::build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(n_pos_per_token()); - - inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp->pos); - - res->add_input(inp); + //cb(cur, "inp_embd", -1); - return inp->pos; + return inp; } -ggml_tensor * llama_context_base::build_inp_pos_bucket( - llama_graph_result * res, +llama_graph_input_ptr llama_context_base::build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const { auto inp = std::make_shared(model.hparams); - inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - ggml_set_input(inp->pos_bucket); - - res->add_input(inp); - - return inp->pos_bucket; -} - -ggml_tensor * llama_context_base::build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const { - auto inp = std::make_shared(model.hparams, cparams, n_outputs); - - inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); - ggml_set_input(inp->out_ids); - - res->add_input(inp); - - return inp->out_ids; -} - -ggml_tensor * llama_context_base::build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(cparams); - - inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp->mean); - - res->add_input(inp); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + ggml_set_input(inp->cur); - return inp->mean; -} - -ggml_tensor * llama_context_base::build_inp_cls( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(cparams); - - inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->cls); - - res->add_input(inp); - - return inp->cls; + return inp; } llama_graph_input_attn_ptr llama_context_base::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -1841,8 +1519,6 @@ llama_graph_input_attn_ptr llama_context_base::build_attn_inp( inp->kq_mask_cnv = cparams.flash_attn ? 
ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; - res->add_input(inp); - return inp; } @@ -1874,6 +1550,55 @@ ggml_tensor * llama_context_base::build_attn( return cur; } +ggml_tensor * llama_context_base::build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) const { + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & n_rot = model.hparams.n_rot; + const auto & rope_type = model.hparams.rope_type; + + struct ggml_tensor * tmp; + + if (ggml_is_quantized(cur->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); + + if (bbuf) { + for (const auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + } + + tmp = ggml_rope_ext_inplace(ctx0, tmp, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + + tmp = ggml_cpy(ctx0, tmp, cur); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, cur, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + + return tmp; +} + ggml_tensor * llama_context_base::build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, @@ -2485,6 +2210,7 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i // llama_context_kv_self // +// I32 [n_kv, n_batch] class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { public: llama_graph_input_pos_bucket_kv( @@ -2494,20 +2220,18 @@ class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - const llama_hparams & hparams; const llama_kv_cache_unified * kv_self; }; void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { - if (pos_bucket) { + if (cur) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) cur->data; const int64_t n_kv = kv_self->n; @@ -3311,24 +3035,20 @@ ggml_cgraph * llama_context_kv_self::graph_init() { return llama_context_base::graph_init(); } -ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( - llama_graph_result * res, +llama_graph_input_ptr llama_context_kv_self::build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const { auto inp = std::make_shared(model.hparams, kv_self.get()); const auto n_kv = kv_self->n; - inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - ggml_set_input(inp->pos_bucket); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + ggml_set_input(inp->cur); - res->inputs.push_back(inp); 
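A minimal sketch of the builder contract used above, assuming lgf points at the llama_graph_i implementation and res at the llama_graph_result being populated (names follow this patch; the snippet is illustrative only): the context-side builder just allocates the input object, and the caller registers it and reads the tensor from inp->cur.

    // illustrative only: consuming a context-provided graph input
    llama_graph_input_ptr inp = lgf->build_inp_pos_bucket(ctx0, n_tokens);
    res->add_input(inp);                  // the graph result keeps the shared_ptr alive
    ggml_tensor * pos_bucket = inp->cur;  // I32 [n_kv, n_batch], filled later by set_input()
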
- - return inp->pos_bucket; + return inp; } llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -3359,8 +3079,6 @@ llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; } - res->add_input(inp); - return inp; } @@ -3833,6 +3551,7 @@ size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_se // llama_context_recurrent // +// I32 [kv_size] class llama_graph_input_s_copy : public llama_graph_input_i { public: llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} @@ -3840,8 +3559,6 @@ class llama_graph_input_s_copy : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_copy; // I32 [kv_size] - llama_kv_cache_recurrent * kv_self; }; @@ -3850,9 +3567,9 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = kv_self->n; - if (s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); - int32_t * data = (int32_t *) s_copy->data; + if (cur) { + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); + int32_t * data = (int32_t *) cur->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { @@ -3878,6 +3595,7 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { } } +// F32 [1, n_kv] class llama_graph_input_s_mask : public llama_graph_input_i { public: llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} @@ -3885,8 +3603,6 @@ class llama_graph_input_s_mask : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_mask; // F32 [1, n_kv] - llama_kv_cache_recurrent * kv_self; }; @@ -3895,9 +3611,9 @@ void llama_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = kv_self->n; - if (s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); - float * data = (float *) s_mask->data; + if (cur) { + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); + float * data = (float *) cur->data; // clear unused states for (int i = 0; i < n_kv; ++i) { @@ -4302,36 +4018,30 @@ ggml_cgraph * llama_context_recurrent::graph_init() { return llama_context_base::graph_init(); } -ggml_tensor * llama_context_recurrent::build_inp_s_copy( - llama_graph_result * res, +llama_graph_input_ptr llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0) const { auto inp = std::make_shared(kv_self.get()); const auto n_kv = kv_self->n; - inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - //cb(inp.s_copy, "inp_s_copy", -1); - ggml_set_input(inp->s_copy); - - res->add_input(inp); + inp->cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp.cur, "inp_s_copy", -1); + ggml_set_input(inp->cur); - return inp->s_copy; + return inp; } -ggml_tensor * llama_context_recurrent::build_inp_s_mask( - llama_graph_result * res, +llama_graph_input_ptr llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0) const { auto inp = std::make_shared(kv_self.get()); const auto n_kv = kv_self->n; - inp->s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - //cb(inp->s_mask, "inp_s_mask", -1); - ggml_set_input(inp->s_mask); - - res->add_input(inp); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp->cur, 
"inp_s_mask", -1); + ggml_set_input(inp->cur); - return inp->s_mask; + return inp; } ggml_tensor * llama_context_recurrent::build_copy_mask_state( @@ -4904,6 +4614,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // llama_context_dec // +// F32 [n_embd, n_outputs_enc] class llama_graph_input_cross_embd : public llama_graph_input_i { public: llama_graph_input_cross_embd( @@ -4912,26 +4623,24 @@ class llama_graph_input_cross_embd : public llama_graph_input_i { void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - const llama_cross * cross; }; void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) { GGML_UNUSED(ubatch); - if (cross_embd && cross->t_embd) { - assert(cross_embd->type == GGML_TYPE_F32); + if (cur && cross->t_embd) { + assert(cur->type == GGML_TYPE_F32); - ggml_backend_tensor_set(cross_embd, cross->v_embd, 0, ggml_nbytes(cross_embd)); + ggml_backend_tensor_set(cur, cross->v_embd, 0, ggml_nbytes(cur)); } } class llama_graph_input_attn_dec : public llama_graph_input_attn_i { public: llama_graph_input_attn_dec( - llama_graph_input_attn_i * inp_kv_self, - const llama_cross * cross) : inp_kv_self(inp_kv_self), cross(cross) {} + llama_graph_input_attn_ptr inp_kv_self, + const llama_cross * cross) : inp_kv_self(std::move(inp_kv_self)), cross(cross) {} void set_input(const llama_ubatch * ubatch) override; @@ -4942,11 +4651,14 @@ class llama_graph_input_attn_dec : public llama_graph_input_attn_i { ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch] ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch] - llama_graph_input_attn_i * inp_kv_self = nullptr; + llama_graph_input_attn_ptr inp_kv_self = nullptr; + const llama_cross * cross = nullptr; }; void llama_graph_input_attn_dec::set_input(const llama_ubatch * ubatch) { + inp_kv_self->set_input(ubatch); + if (cross_kq_mask) { const int64_t n_enc = cross_kq_mask->ne[0]; const int64_t n_tokens = ubatch->n_tokens; @@ -4990,17 +4702,16 @@ ggml_cgraph * llama_context_dec::graph_init() { return llama_context_kv_self::graph_init(); } -ggml_tensor * llama_context_dec::build_inp_cross_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_context_dec::build_inp_cross_embd( ggml_context * ctx0) const { auto inp = std::make_shared(cross); // if we have the output embeddings from the encoder, use them directly // TODO: needs more work to be correct, for now just use the tensor shape //if (cross->t_embd) { - // inp->cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + // inp->cur = ggml_view_tensor(ctx0, cross->t_embd); - // return inp->cross_embd; + // return inp->cur; //} const auto & hparams = model.hparams; @@ -5008,23 +4719,20 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd( const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; const auto n_enc = cross->t_embd ? 
cross->t_embd->ne[1] : hparams.n_ctx_train; - inp->cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); - ggml_set_input(inp->cross_embd); - - res->add_input(inp); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); + ggml_set_input(inp->cur); - return inp->cross_embd; + return inp; } llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa) const { - auto inp_kv_self = llama_context_kv_self::build_attn_inp(res, ctx0, n_tokens, causal, swa); + auto inp_kv_self = llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); - auto inp = std::make_shared(inp_kv_self.get(), cross); + auto inp = std::make_shared(std::move(inp_kv_self), cross); const int32_t n_enc = cross->t_embd ? cross->t_embd->ne[1] : model.hparams.n_ctx_train; @@ -5033,8 +4741,6 @@ llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; - res->add_input(inp); - return inp; } diff --git a/src/llama-context.h b/src/llama-context.h index 21015e8796e40..a5159bc5b34b6 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -242,12 +242,6 @@ class llama_context_base : public llama_context, public llama_graph_i { int decode(llama_batch & inp_batch) override; protected: - // - // input - // - - virtual int64_t n_pos_per_token() const; // vision - // // output // @@ -287,6 +281,8 @@ class llama_context_base : public llama_context, public llama_graph_i { // graph build // + int32_t get_n_outputs() const override; + void build_cb( ggml_tensor * cur, const char * name, @@ -314,45 +310,16 @@ class llama_context_base : public llama_context, public llama_graph_i { ggml_tensor * build_rope_factors(int il) const override; - ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const override; - - ggml_tensor * build_inp_embd( - llama_graph_result * res, + llama_graph_input_ptr build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const override; - ggml_tensor * build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const override; - - ggml_tensor * build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_cls( - llama_graph_result * res, + llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -370,7 +337,15 @@ class llama_context_base : public llama_context, public llama_graph_i { int il) const override; protected: - virtual ggml_tensor * build_attn_mha( + // note: optionally set the backend to be the same as the bbuf's backend + ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) const; + + ggml_tensor * build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q, @@ -458,28 +433,9 @@ class llama_context_base : public llama_context, public 
llama_graph_i { llama_loras loras; llama_sbatch sbatch; - ggml_threadpool_t threadpool = nullptr; - ggml_threadpool_t threadpool_batch = nullptr; - - ggml_abort_callback abort_callback = nullptr; - void * abort_callback_data = nullptr; - - ggml_backend_t backend_cpu = nullptr; - std::vector backends; - - std::vector> set_n_threads_fns; - ggml_backend_sched_ptr sched; - // buffer types used for the compute buffer of each backend - std::vector backend_ptrs; - std::vector backend_buft; - - // memory buffers used to evaluate the model - std::vector buf_compute_meta; - - // host buffer for the model output (logits and embeddings) - ggml_backend_buffer_ptr buf_output; + // TODO: these below likely need some rework in the future, together with the batch-refactoring // TODO: remove bool logits_all = false; @@ -502,6 +458,30 @@ class llama_context_base : public llama_context, public llama_graph_i { std::vector output_ids; // map batch token positions to ids of the logits and embd buffers +private: + // base functionality - should not leak into derived classes + + ggml_threadpool_t threadpool = nullptr; + ggml_threadpool_t threadpool_batch = nullptr; + + ggml_abort_callback abort_callback = nullptr; + void * abort_callback_data = nullptr; + + ggml_backend_t backend_cpu = nullptr; + std::vector backends; + + std::vector> set_n_threads_fns; + + // buffer types used for the compute buffer of each backend + std::vector backend_ptrs; + std::vector backend_buft; + + // memory buffers used to evaluate the model + std::vector buf_compute_meta; + + // host buffer for the model output (logits and embeddings) + ggml_backend_buffer_ptr buf_output; + bool has_evaluated_once = false; }; @@ -539,13 +519,11 @@ class llama_context_kv_self : public llama_context_base { // graph build // - ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, + llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -624,12 +602,10 @@ class llama_context_recurrent : public llama_context_base { // graph build // - ggml_tensor * build_inp_s_copy( - llama_graph_result * res, + llama_graph_input_ptr build_inp_s_copy( ggml_context * ctx0) const override; - ggml_tensor * build_inp_s_mask( - llama_graph_result * res, + llama_graph_input_ptr build_inp_s_mask( ggml_context * ctx0) const override; ggml_tensor * build_copy_mask_state( @@ -694,6 +670,10 @@ class llama_context_recurrent : public llama_context_base { std::unique_ptr kv_self; }; +// +// enc-dec +// + // TODO: tmp - need something better to pass the data from the encoder to the decoder struct llama_cross { // the output embeddings from the encoder as a ggml tensor @@ -714,7 +694,7 @@ class llama_context_enc : public llama_context_base { int encode(llama_batch & inp_batch) override; - llama_cross * cross = nullptr; + llama_cross * cross = nullptr; // TODO: hacky, rework }; class llama_context_dec : public llama_context_kv_self { @@ -730,12 +710,10 @@ class llama_context_dec : public llama_context_kv_self { ggml_cgraph * graph_init() override; - ggml_tensor * build_inp_cross_embd( - llama_graph_result * res, + llama_graph_input_ptr build_inp_cross_embd( ggml_context * ctx0) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -753,7 +731,7 @@ class llama_context_dec : public llama_context_kv_self { int il) const override; 
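The set_input() pattern shared by the inputs in this patch can be summarized with a small hypothetical input class (not one added by the patch); it assumes the base class exposes the default cur tensor and that set_input() runs after the backend buffers are allocated, as in the implementations above.

    // hypothetical example, for illustration only: an I32 [n_tokens] input filled from the ubatch
    class llama_graph_input_example : public llama_graph_input_i {
    public:
        void set_input(const llama_ubatch * ubatch) override {
            if (cur) {
                GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer));

                int32_t * data = (int32_t *) cur->data;

                for (int64_t i = 0; i < ubatch->n_tokens; ++i) {
                    data[i] = (int32_t) i; // identity mapping, purely illustrative
                }
            }
        }
    };
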
public: - llama_cross * cross = nullptr; + llama_cross * cross = nullptr; // TODO: hacky, rework }; class llama_context_enc_dec : public llama_context { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 79b26d1734ca3..89e311a915a31 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -67,20 +67,16 @@ ggml_tensor * llama_graph_i::build_attn_cross( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_cross_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_graph_i::build_inp_cross_embd( ggml_context * ctx0) const { - GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } -ggml_tensor * llama_graph_i::build_inp_s_copy ( - llama_graph_result * res, +llama_graph_input_ptr llama_graph_i::build_inp_s_copy ( ggml_context * ctx0) const { - GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -88,10 +84,8 @@ ggml_tensor * llama_graph_i::build_inp_s_copy ( return nullptr; // NOLINT } -ggml_tensor * llama_graph_i::build_inp_s_mask( - llama_graph_result * res, +llama_graph_input_ptr llama_graph_i::build_inp_s_mask( ggml_context * ctx0) const { - GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); diff --git a/src/llama-graph.h b/src/llama-graph.h index 7ae99becc7e23..343d4a0772277 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -29,6 +29,9 @@ class llama_graph_input_i { virtual ~llama_graph_input_i() = default; virtual void set_input(const llama_ubatch * ubatch) = 0; + + // by default, we produce a single input tensor, but some children could produce more + ggml_tensor * cur = nullptr; }; using llama_graph_input_ptr = std::shared_ptr; @@ -76,7 +79,7 @@ class llama_graph_result : public llama_graph_result_i { } } - void add_input(llama_graph_input_ptr && input) { + void add_input(llama_graph_input_ptr input) { inputs.emplace_back(std::move(input)); } @@ -92,19 +95,23 @@ class llama_graph_result : public llama_graph_result_i { // llama_graph // +// note: keep all methods const // TODO: can become more granular in the future -// TODO: move all methods that do not require things from llama_context to llm_build_context class llama_graph_i { public: llama_graph_i(llama_graph_type type); virtual ~llama_graph_i() = default; - llama_graph_type get_type() const { return type; } + llama_graph_type get_type() const { + return type; + } -protected: +private: llama_graph_type type; public: + virtual int32_t get_n_outputs() const = 0; + // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
virtual void build_cb( ggml_tensor * cur, @@ -131,50 +138,27 @@ class llama_graph_i { ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * ids) const = 0; + // rope factors based on the current context size virtual ggml_tensor * build_rope_factors(int il) const = 0; - // note: optionally set the backend to be the same as the bbuf's backend - virtual ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const = 0; - // graph build API (context-specific) - virtual ggml_tensor * build_inp_embd( - llama_graph_result * res, + // input embeddings with optional lora + virtual llama_graph_input_ptr build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const = 0; - virtual ggml_tensor * build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; - - virtual ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; - - virtual ggml_tensor * build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const = 0; - - virtual ggml_tensor * build_inp_mean( - llama_graph_result * res, + // enc-dec pos + virtual llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const = 0; - virtual ggml_tensor * build_inp_cls( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; + // + // attention API + // virtual llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -202,16 +186,17 @@ class llama_graph_i { float kq_scale, int il) const; - virtual ggml_tensor * build_inp_cross_embd( - llama_graph_result * res, + virtual llama_graph_input_ptr build_inp_cross_embd( ggml_context * ctx0) const; - virtual ggml_tensor * build_inp_s_copy( - llama_graph_result * res, + // + // recurrent API + // + + virtual llama_graph_input_ptr build_inp_s_copy( ggml_context * ctx0) const; - virtual ggml_tensor * build_inp_s_mask( - llama_graph_result * res, + virtual llama_graph_input_ptr build_inp_s_mask( ggml_context * ctx0) const; virtual ggml_tensor * build_copy_mask_state( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7fae82c6ecc49..60a8cc0f8b0a7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3813,6 +3813,212 @@ enum llm_norm_type { LLM_NORM_GROUP, }; +class llama_graph_input_pos : public llama_graph_input_i { +public: + llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + virtual ~llama_graph_input_pos() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos = nullptr; // I32 [n_batch] + + const int64_t n_pos_per_token = 1; +}; + +void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { + if (ubatch->pos && pos) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + } +} + +class llama_graph_input_out_ids : public llama_graph_input_i { +public: + llama_graph_input_out_ids( + const llama_hparams & hparams, + const llama_cparams & cparams, + int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + virtual ~llama_graph_input_out_ids() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * out_ids; // I32 [n_outputs] + + const llama_hparams & hparams; + const llama_cparams & cparams; + + const int32_t 
n_outputs; +}; + +void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); + + if (!out_ids) { + LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); + int32_t * data = (int32_t *) out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch->output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch->output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); + } + } + } +} + +class llama_graph_input_mean : public llama_graph_input_i { +public: + llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} + virtual ~llama_graph_input_mean() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * mean; // F32 [n_batch, n_batch] + + const llama_cparams & cparams; +}; + +void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) { + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(mean); + GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); + + float * data = (float *) mean->data; + memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + + sum[seq_id] += ubatch->n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } + } +} + +class llama_graph_input_cls : public llama_graph_input_i { +public: + llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} + virtual ~llama_graph_input_cls() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * cls; // I32 [n_batch] + + const llama_cparams & cparams; +}; + +void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) { + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + 
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } +} + struct llm_build_context { const llama_model & model; const llama_hparams & hparams; @@ -3895,55 +4101,75 @@ struct llm_build_context { res (std::make_unique()) { } + int64_t n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; + } + // TODO: tmp void cb(struct ggml_tensor * cur, const char * name, int il) { lgf->build_cb(cur, name, ubatch, il); } - // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf->build_inp_embd(res.get(), ctx0, tok_embd, ubatch); - cb(inpL, "inp_embd", -1); + auto inp = lgf->build_inp_embd(ctx0, tok_embd, ubatch); + + cb(inp->cur, "inp_embd", -1); - return inpL; + res->add_input(inp); + + return inp->cur; } - // TODO: tmp - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens); - cb(cur, "inp_pos", -1); + struct ggml_tensor * build_inp_pos() const { + auto inp = std::make_shared(n_pos_per_token()); - return cur; + inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp->pos); + + res->add_input(inp); + + return inp->pos; } - // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0); - cb(cur, "inp_out_ids", -1); + const auto n_outputs = lgf->get_n_outputs(); - return cur; + auto inp = std::make_shared(hparams, cparams, n_outputs); + + inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + ggml_set_input(inp->out_ids); + + res->add_input(inp); + + return inp->out_ids; } - // TODO: tmp struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens); - cb(cur, "inp_mean", -1); + auto inp = std::make_shared(cparams); - return cur; + inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp->mean); + + res->add_input(inp); + + return inp->mean; } - // TODO: tmp struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens); - cb(cur, "inp_cls", -1); + auto inp = std::make_shared(cparams); - return cur; + inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + 
ggml_set_input(inp->cls); + + res->add_input(inp); + + return inp->cls; } // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, - struct ggml_tensor * cur) { + struct ggml_tensor * cur) const { return lgf->build_lora_mm(ctx0, w, cur); } @@ -3951,24 +4177,42 @@ struct llm_build_context { struct ggml_tensor * build_lora_mm_id( struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { + struct ggml_tensor * ids) const { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } - // TODO: tmp struct ggml_tensor * build_pos_bucket() { - ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens); - cb(cur, "pos_bucket", -1); + auto inp = lgf->build_inp_pos_bucket(ctx0, n_tokens); + cb(inp->cur, "pos_bucket", -1); - return cur; + res->add_input(inp); + + return inp->cur; } - // TODO: tmp struct ggml_tensor * build_inp_cross_embd() { - ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0); - cb(cur, "embd_enc", -1); + auto inp = lgf->build_inp_cross_embd(ctx0); + cb(inp->cur, "embd_enc", -1); - return cur; + res->add_input(inp); + + return inp->cur; + } + + struct ggml_tensor * build_inp_s_copy() const { + auto inp = lgf->build_inp_s_copy(ctx0); + + res->add_input(inp); + + return inp->cur; + } + + struct ggml_tensor * build_inp_s_mask() const { + auto inp = lgf->build_inp_s_mask(ctx0); + + res->add_input(inp); + + return inp->cur; } struct ggml_tensor * build_norm( @@ -4250,6 +4494,18 @@ struct llm_build_context { return moe_out; } + llama_graph_input_attn_ptr build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = lgf->build_attn_inp(ctx0, n_tokens, causal, swa); + + res->add_input(inp); + + return inp; + } + struct ggml_tensor * build_attn( llama_graph_input_attn_i * inp, ggml_cgraph * gf, @@ -4490,7 +4746,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4651,7 +4907,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4807,7 +5063,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4923,7 +5179,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5028,7 +5284,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5151,7 +5407,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5303,7 +5559,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5425,7 +5681,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5526,7 +5782,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5640,7 +5896,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5785,7 +6041,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); inpL = build_norm(inpL, model.tok_norm, @@ -5888,7 +6144,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); if (model.pos_embd) { // inp_pos - contains the positions @@ -6030,11 +6286,9 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { - - // norm cur = 
build_norm(inpL, model.layers[il].attn_norm, @@ -6181,7 +6435,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6295,7 +6549,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6408,7 +6662,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6526,7 +6780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6673,7 +6927,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6795,8 +7049,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6940,7 +7193,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -7046,7 +7299,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7152,7 +7405,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7263,7 +7516,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { 
struct ggml_tensor * inpSA = inpL; @@ -7382,7 +7635,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7510,7 +7763,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7711,7 +7964,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { // norm @@ -7819,7 +8072,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { // norm @@ -7949,7 +8202,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8062,8 +8315,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); for (int il = 0; il < n_layer; ++il) { // norm @@ -8124,7 +8377,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -8272,7 +8525,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8407,7 +8660,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8527,7 +8780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8651,7 +8904,7 @@ struct llm_build_context { 
// inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8772,7 +9025,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8900,7 +9153,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -9044,7 +9297,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9174,7 +9427,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9337,7 +9590,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9555,7 +9808,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9706,7 +9959,7 @@ struct llm_build_context { struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9809,7 +10062,7 @@ struct llm_build_context { const int64_t n_outputs_enc = embd_enc->ne[1]; - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9972,7 +10225,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10066,7 +10319,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), 
ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10196,7 +10449,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10317,7 +10570,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10435,8 +10688,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10527,8 +10780,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10622,7 +10875,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; diff --git a/src/llama-model.h b/src/llama-model.h index 2d64c0d242c09..45abce7d53d8a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -365,7 +365,6 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; - // TODO: add encode/decode graphs llama_graph_result_ptr build_graph( ggml_context * ctx, ggml_cgraph * gf, From 624f7bd03bdea9e8d5c6d2ca02d87268394cc20c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 21:13:08 +0200 Subject: [PATCH 84/84] graph : add comments ggml-ci --- src/llama-context.cpp | 1 + src/llama-graph.cpp | 16 +++++++------- src/llama-graph.h | 51 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7ba86a2a7f91a..8963b85ca8151 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -101,6 +101,7 @@ void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { } } +// note: this does not depend on the context and can technically be moved to llama-model.cpp class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 89e311a915a31..119f1a56f3841 100644 --- a/src/llama-graph.cpp +++ 
b/src/llama-graph.cpp
@@ -19,6 +19,14 @@ ggml_tensor * llama_graph_input_attn_i::get_kq_mask_cross() {
 
 llama_graph_i::llama_graph_i(llama_graph_type type) : type(type) {}
 
+llama_graph_input_ptr llama_graph_i::build_inp_cross_embd(
+        ggml_context * ctx0) const {
+    GGML_UNUSED(ctx0);
+
+    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
+    return nullptr;
+}
+
 ggml_tensor * llama_graph_i::build_attn(
         llama_graph_input_attn_i * inp,
         ggml_context * ctx0,
@@ -67,14 +75,6 @@ ggml_tensor * llama_graph_i::build_attn_cross(
     return nullptr;
 }
 
-llama_graph_input_ptr llama_graph_i::build_inp_cross_embd(
-        ggml_context * ctx0) const {
-    GGML_UNUSED(ctx0);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-    return nullptr;
-}
-
 llama_graph_input_ptr llama_graph_i::build_inp_s_copy (
         ggml_context * ctx0) const {
     GGML_UNUSED(ctx0);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 343d4a0772277..2d62c674f2679 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -10,32 +10,49 @@
 struct ggml_cgraph;
 struct ggml_context;
 struct ggml_tensor;
-struct ggml_backend_buffer;
 
 struct llama_ubatch;
 
+// certain models (typically multi-modal) can produce different types of graphs
+// the llama_context specifies which type of graph it needs through the llama_graph_i::type member
 enum llama_graph_type {
     LLAMA_GRAPH_TYPE_DEFAULT,
     LLAMA_GRAPH_TYPE_ENCODER,
     LLAMA_GRAPH_TYPE_DECODER,
 };
 
+
 //
 // llama_graph_input
 //
 
+// denotes an input to the graph
+// typically, the data of these objects is populated based on the contents of the current llama_ubatch:
+//
+//   - llama_graph_input_pos
+//   - llama_graph_input_out_ids
+//   - etc.
+//
+// some inputs require context-specific data (e.g. KV cache) - such inputs are defined for the specific llama_context:
+//
+//   - llama_graph_input_embd         (can apply lora)
+//   - llama_graph_input_attn_kv_self (requires KV cache instance)
+//   - etc.
+//
+
 class llama_graph_input_i {
 public:
     virtual ~llama_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
 
-    // by default, we produce a single input tensor, but some children could produce more
+    // by default, we produce a single input tensor, but some implementations could produce more
    ggml_tensor * cur = nullptr;
 };
 
 using llama_graph_input_ptr = std::shared_ptr<llama_graph_input_i>;
 
+
 class llama_graph_input_attn_i : public llama_graph_input_i {
 public:
     virtual ~llama_graph_input_attn_i() = default;
@@ -47,10 +64,17 @@ class llama_graph_input_attn_i : public llama_graph_input_i {
 
 using llama_graph_input_attn_ptr = std::shared_ptr<llama_graph_input_attn_i>;
 
+
 //
 // llama_graph_result
 //
 
+// these objects deliver the result from the graph build process back to the llama_context
+// note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
+// specific data, by calling the set_inputs() method
+// along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
+// these are used by the llama_context to extract the relevant data, based on the compute parameters
+
 class llama_graph_result_i {
 public:
     virtual ~llama_graph_result_i() = default;
@@ -64,9 +88,9 @@ class llama_graph_result_i {
 
 using llama_graph_result_ptr = std::unique_ptr<llama_graph_result_i>;
 
+
 class llama_graph_result : public llama_graph_result_i {
 public:
-    llama_graph_result() = default;
     virtual ~llama_graph_result() = default;
 
     ggml_tensor * get_logits() override { return t_logits; }
@@ -91,10 +115,19 @@ class llama_graph_result : public llama_graph_result_i {
     std::vector<llama_graph_input_ptr> inputs;
 };
 
+
 //
 // llama_graph
 //
 
+// this interface defines an API for building graphs by abstracting some high-level concepts such as attention, lora, etc.
+// functionality that is trivial and does not rely on the llama_context should be directly implemented in llm_build_context
+// other context-specific functionality should be declared here and implemented in the llama_context variations
+
+// the main goal of this interface is to separate the llama_context specifics from the graph building logic
+// this allows for cleaner model architecture definitions while being able to overload certain complex
+// functionality in order to fit different use cases and/or explore new implementations and ideas
+
 // note: keep all methods const
 // TODO: can become more granular in the future
 class llama_graph_i {
@@ -112,6 +145,10 @@ class llama_graph_i {
 public:
     virtual int32_t get_n_outputs() const = 0;
 
+    //
+    // context-specific API
+    //
+
     // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     virtual void build_cb(
             ggml_tensor * cur,
@@ -141,8 +178,6 @@ class llama_graph_i {
     // rope factors based on the current context size
     virtual ggml_tensor * build_rope_factors(int il) const = 0;
 
-    // graph build API (context-specific)
-
     // input embeddings with optional lora
     virtual llama_graph_input_ptr build_inp_embd(
             ggml_context * ctx0,
@@ -154,6 +189,9 @@ class llama_graph_i {
             ggml_context * ctx0,
             int32_t n_tokens) const = 0;
 
+    virtual llama_graph_input_ptr build_inp_cross_embd(
+            ggml_context * ctx0) const;
+
     //
     // attention API
     //
@@ -186,9 +224,6 @@ class llama_graph_i {
             float kq_scale,
             int il) const;
 
-    virtual llama_graph_input_ptr build_inp_cross_embd(
-            ggml_context * ctx0) const;
-
     //
     // recurrent API
     //
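
To illustrate the input/result pattern described in the comments above, here is a minimal, self-contained C++ sketch. The stand-in types and simplified signatures are assumptions for illustration only, not the actual llama.cpp declarations: an input object implements set_input() and exposes its tensor through cur, the result object collects inputs via add_input(), a builder-side wrapper (analogous to the build_inp_pos / build_attn_inp wrappers added to llm_build_context in this series) registers each input before returning its tensor, and the context later calls set_inputs() to populate everything from the current ubatch.

#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

// stand-ins for the real ggml / llama types (illustrative only)
struct ggml_tensor_stub  { int n_tokens = 0; };
struct llama_ubatch_stub { int n_tokens = 0; };

// mirrors the role of llama_graph_input_i: one object per graph input
class graph_input_i {
public:
    virtual ~graph_input_i() = default;
    virtual void set_input(const llama_ubatch_stub * ubatch) = 0;

    ggml_tensor_stub * cur = nullptr; // tensor produced for this input
};
using graph_input_ptr = std::shared_ptr<graph_input_i>;

// hypothetical ubatch-driven input, in the spirit of llama_graph_input_pos
class graph_input_pos : public graph_input_i {
public:
    void set_input(const llama_ubatch_stub * ubatch) override {
        // the real implementation would copy the ubatch positions into cur's buffer
        std::printf("populating positions for %d tokens\n", ubatch->n_tokens);
    }
};

// mirrors the role of llama_graph_result: keeps references to the inputs it was given
class graph_result {
public:
    void add_input(graph_input_ptr inp) { inputs.push_back(std::move(inp)); }

    void set_inputs(const llama_ubatch_stub * ubatch) {
        for (auto & inp : inputs) {
            inp->set_input(ubatch);
        }
    }

private:
    std::vector<graph_input_ptr> inputs;
};

// builder-side wrapper, analogous to llm_build_context::build_inp_pos:
// create the input, register it in the result, return its tensor for graph construction
static ggml_tensor_stub * build_inp_pos(graph_result & res, ggml_tensor_stub * storage) {
    auto inp = std::make_shared<graph_input_pos>();
    inp->cur = storage;
    res.add_input(inp);
    return inp->cur;
}

int main() {
    graph_result res;

    ggml_tensor_stub pos_storage;
    ggml_tensor_stub * pos = build_inp_pos(res, &pos_storage); // used while building the graph
    (void) pos;

    llama_ubatch_stub ubatch;
    ubatch.n_tokens = 32;
    res.set_inputs(&ubatch); // the context populates all registered inputs before running the graph
    return 0;
}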
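
The llama_graph_i comments also rely on an override pattern that the llama-graph.cpp hunk above shows directly: the base interface provides stub implementations for optional, context-specific builders that only log "not implemented" and return nullptr (see llama_graph_i::build_inp_cross_embd), and each llama_context variation overrides just the builders it supports. A compressed sketch of that shape, using invented names and simplified return types rather than the real signatures:

#include <cstdio>

struct ggml_context_stub;   // opaque stand-in for ggml_context
struct graph_input_stub {}; // stand-in for a graph input object

// base interface: optional context-specific builders default to "not implemented"
class graph_iface {
public:
    virtual ~graph_iface() = default;

    virtual graph_input_stub * build_inp_cross_embd(ggml_context_stub * ctx0) const {
        (void) ctx0;
        std::fprintf(stderr, "%s: not implemented\n", __func__);
        return nullptr;
    }
};

// a context variation that actually supports cross-attention embeddings
class graph_enc_dec : public graph_iface {
public:
    graph_input_stub * build_inp_cross_embd(ggml_context_stub * ctx0) const override {
        (void) ctx0;
        static graph_input_stub inp; // placeholder: the real code would build and return a new input
        return &inp;
    }
};

int main() {
    graph_enc_dec g;
    graph_iface & base = g;
    return base.build_inp_cross_embd(nullptr) != nullptr ? 0 : 1;
}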