
Commit 5e8f776

llama : move llama_batch backward-compat function to common

1 parent 1788077 commit 5e8f776

File tree

10 files changed: +124 −98 lines changed

common/common.cpp

Lines changed: 77 additions & 0 deletions

@@ -1856,6 +1856,83 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
     return result;
 }
 
+//
+// Compatibility with old API
+//
+
+struct llama_batch llama_batch_get_one(
+             llama_token * tokens,
+                   int32_t n_tokens,
+                 llama_pos pos_0,
+              llama_seq_id seq_id,
+                      bool logits_all) {
+    // because the old API does not call llama_batch_free,
+    // we assume that batches generated by llama_batch_get_one are singletons
+    static std::vector<llama_pos>      pos;
+    static std::vector<int32_t>        n_seq_id;
+    static std::array<llama_seq_id, 1> seq_id_0;
+    static std::vector<llama_seq_id *> seq_ids;
+    static std::vector<int8_t>         logits;
+    pos     .resize(n_tokens);
+    n_seq_id.resize(n_tokens);
+    seq_ids .resize(n_tokens + 1);
+    logits  .resize(n_tokens);
+    seq_id_0[0] = seq_id;
+    seq_ids [n_tokens] = nullptr;
+    llama_batch batch = {
+        /*n_tokens =*/ 0,
+        /*tokens   =*/ tokens,
+        /*embd     =*/ nullptr,
+        /*pos      =*/ pos.data(),
+        /*n_seq_id =*/ n_seq_id.data(),
+        /*seq_id   =*/ seq_ids.data(),
+        /*logits   =*/ logits.data(),
+    };
+    for (int i = 0; i < n_tokens; i++) {
+        // seq_id[i] must point at valid storage before llama_batch_add writes into it
+        batch.seq_id[i] = seq_id_0.data();
+        llama_batch_add(batch, tokens[i], pos_0 + i, { seq_id },
+                        logits_all || i == n_tokens - 1);
+    }
+    return batch;
+}
+
+struct llama_batch llama_batch_get_one(
+                   float * embd,
+                   int32_t n_tokens,
+                 llama_pos pos_0,
+              llama_seq_id seq_id,
+                      bool logits_all) {
+    // because the old API does not call llama_batch_free,
+    // we assume that batches generated by llama_batch_get_one are singletons
+    static std::vector<llama_pos>      pos;
+    static std::vector<int32_t>        n_seq_id;
+    static std::array<llama_seq_id, 1> seq_id_0;
+    static std::vector<llama_seq_id *> seq_ids;
+    static std::vector<int8_t>         logits;
+    pos     .resize(n_tokens);
+    n_seq_id.resize(n_tokens);
+    seq_ids .resize(n_tokens + 1);
+    logits  .resize(n_tokens);
+    seq_id_0[0] = seq_id;
+    seq_ids [n_tokens] = nullptr;
+    llama_batch batch = {
+        /*n_tokens =*/ n_tokens,
+        /*tokens   =*/ nullptr,
+        /*embd     =*/ embd,
+        /*pos      =*/ pos.data(),
+        /*n_seq_id =*/ n_seq_id.data(),
+        /*seq_id   =*/ seq_ids.data(),
+        /*logits   =*/ logits.data(),
+    };
+    for (int i = 0; i < n_tokens; i++) {
+        batch.pos     [i] = pos_0 + i;
+        batch.n_seq_id[i] = 1;
+        batch.seq_id  [i] = seq_id_0.data();
+        batch.logits  [i] = logits_all || i == n_tokens - 1;
+    }
+    return batch;
+}
+
 //
 // YAML utils
 //
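The relocated helper keeps the old call shape, with an optional logits_all argument on top. A minimal usage sketch, assuming an initialized llama_context (the eval_prompt wrapper is illustrative, not part of this commit):

```cpp
// Sketch: evaluating a tokenized prompt with the compat helper from common.
#include "common.h"

#include <vector>

static bool eval_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    // logits_all defaults to false: only the last token's logits are computed
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(),
                                            /*pos_0 =*/ 0, /*seq_id =*/ 0);
    // no llama_batch_free() here: the helper reuses static storage,
    // hence the singleton assumption in its implementation
    return llama_decode(ctx, batch) == 0;
}
```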

common/common.h

Lines changed: 20 additions & 0 deletions

@@ -542,6 +542,26 @@ static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
+//
+// Compatibility with old API
+//
+
+// Return batch for single sequence of tokens starting at pos_0
+struct llama_batch llama_batch_get_one(
+             llama_token * tokens,
+                   int32_t n_tokens,
+                 llama_pos pos_0,
+              llama_seq_id seq_id,
+                      bool logits_all = false);
+
+// Return batch for single sequence of embeddings starting at pos_0
+struct llama_batch llama_batch_get_one(
+                   float * embd,
+                   int32_t n_tokens,
+                 llama_pos pos_0,
+              llama_seq_id seq_id,
+                      bool logits_all = false);
+
 //
 // YAML utils
 //

examples/batched-bench/batched-bench.cpp

Lines changed: 0 additions & 1 deletion

@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);

examples/imatrix/imatrix.cpp

Lines changed: 1 addition & 2 deletions

@@ -508,8 +508,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
             tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
         }
 
-        // TODO: use batch.logits to save computations instead of relying on logits_all == true
-        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0, true))) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }

examples/llava/llava.cpp

Lines changed: 3 additions & 1 deletion

@@ -2,6 +2,7 @@
 #include "llava.h"
 
 #include "llama.h"
+#include "common.h"
 
 #include <algorithm>
 #include <cerrno>
@@ -409,7 +410,8 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        float * embd = image_embed->embed+i*n_embd;
+        llama_batch batch = llama_batch_get_one(embd, n_eval, *n_past, 0);
         if (llama_decode(ctx_llama, batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
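The embd overload covers chunked decoding the same way. A sketch under the same assumptions (eval_embd and its parameters are illustrative, not from this commit):

```cpp
// Sketch: feeding n_tokens embedding rows in n_batch-sized chunks,
// mirroring llava_eval_image_embed after this change.
#include "common.h"

#include <algorithm>

static bool eval_embd(llama_context * ctx, float * embd, int n_tokens, int n_embd,
                      int n_batch, int * n_past) {
    for (int i = 0; i < n_tokens; i += n_batch) {
        const int n_eval = std::min(n_tokens - i, n_batch);
        // positions n_past .. n_past + n_eval - 1, all on sequence 0
        llama_batch batch = llama_batch_get_one(embd + (size_t) i * n_embd, n_eval, *n_past, 0);
        if (llama_decode(ctx, batch) != 0) {
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}
```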

examples/parallel/parallel.cpp

Lines changed: 0 additions & 1 deletion

@@ -308,7 +308,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);

examples/perplexity/perplexity.cpp

Lines changed: 2 additions & 5 deletions

@@ -410,8 +410,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
         const int batch_size = std::min(end - batch_start, n_batch);
 
         //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
-        // TODO: use llama_batch.logits instead of relying on logits_all == true
-        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0, true))) {
             //LOG_ERR("%s : failed to eval\n", __func__);
             return {tokens, -1, logit_history, prob_history};
         }
@@ -699,7 +698,6 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);
@@ -1790,8 +1788,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
             tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
         }
 
-        // TODO: use llama_batch.logits instead of relying on logits_all == true
-        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0, true))) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return;
         }

examples/server/server.cpp

Lines changed: 0 additions & 1 deletion

@@ -2283,7 +2283,6 @@ struct server_context {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
 
             const int ret = llama_decode(ctx, batch_view);

include/llama.h

Lines changed: 0 additions & 19 deletions

@@ -244,15 +244,6 @@ extern "C" {
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
        int8_t       *  logits; // TODO: rename this to "output"
-
-        // NOTE: helpers for smooth API transition - can be deprecated in the future
-        //       for future-proof code, use the above fields instead and ignore everything below
-        //
-        // pos[i] = all_pos_0 + i*all_pos_1
-        //
-        llama_pos    all_pos_0;  // used if pos == NULL
-        llama_pos    all_pos_1;  // used if pos == NULL
-        llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;
 
     enum llama_model_kv_override_type {
@@ -775,16 +766,6 @@ extern "C" {
     // Decoding
     //
 
-    // Return batch for single sequence of tokens starting at pos_0
-    //
-    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
-    //
-    LLAMA_API struct llama_batch llama_batch_get_one(
-                  llama_token * tokens,
-                      int32_t   n_tokens,
-                    llama_pos   pos_0,
-                 llama_seq_id   seq_id);
-
     // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
     // Each token can be assigned up to n_seq_max sequence ids
     // The batch has to be freed with llama_batch_free()
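With the compat fields gone, the removed NOTE's advice applies: future-proof code builds batches explicitly via llama_batch_init and releases them with llama_batch_free. A sketch of the explicit equivalent of the old implicit-position behavior (make_batch is illustrative; llama_batch_add is the existing helper from common.h):

```cpp
// Sketch: explicit batch construction replacing the removed
// all_pos_0 / all_pos_1 / all_seq_id fields: pos[i] = pos_0 + i,
// a single sequence id, logits requested for the last token only.
#include "common.h" // llama_batch_add

#include <vector>

static llama_batch make_batch(const std::vector<llama_token> & tokens,
                              llama_pos pos_0, llama_seq_id seq_id) {
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), /*embd =*/ 0, /*n_seq_max =*/ 1);
    for (size_t i = 0; i < tokens.size(); ++i) {
        llama_batch_add(batch, tokens[i], pos_0 + (llama_pos) i, { seq_id },
                        /*logits =*/ i == tokens.size() - 1);
    }
    return batch; // caller must release it with llama_batch_free()
}
```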

src/llama.cpp

Lines changed: 21 additions & 68 deletions

@@ -2941,9 +2941,6 @@ struct llama_sbatch_seq {
     llama_seq_id * seq_id;
     size_t offset;
     size_t length;
-
-    // helper for smoother batch API transition -- can be deprecated in the future
-    llama_seq_id all_seq_id; // used if seq_id == NULL
 };
 
 // sequence-length-aware batch splitting
@@ -3038,30 +3035,18 @@ struct llama_sbatch {
         } else {
             ubatch.embd = nullptr;
         }
-        // from here on, the else branches are deprecated;
-        // they are helpers for smoother batch API transition
-        if (batch->pos) {
-            if (ubatch.equal_seqs) {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
-                }
-            } else {
-                // simple split
-                ubatch.pos = batch->pos + seq.offset;
-            }
-        } else {
+        if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
-                llama_pos bi = ids[seq.offset + i];
-                ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1);
+                ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
             }
+        } else {
+            // simple split
+            ubatch.pos = batch->pos + seq.offset;
         }
         if (ubatch.equal_seqs) {
             ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
             if (seq.seq_id) {
                 ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
-            } else {
-                GGML_ASSERT(seq.n_seq_id == 1);
-                ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id;
             }
         } else {
             // simple split
@@ -3074,10 +3059,6 @@ struct llama_sbatch {
             }
             if (batch->seq_id) {
                 ubatch.seq_id = batch->seq_id + seq.offset;
-            } else {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
-                }
             }
         }
         if (logits_all) {
@@ -3196,7 +3177,6 @@ struct llama_sbatch {
             s.seq_id = nullptr;
             s.offset = 0;
             s.length = n_tokens;
-            s.all_seq_id = batch.all_seq_id;
             return;
         }
         std::sort(ids.begin(), ids.end(),
@@ -3219,7 +3199,7 @@ struct llama_sbatch {
                 if (batch.pos) {
                     return batch.pos[a] < batch.pos[b];
                 }
-                // no pos, sort by id (assuming batch.all_pos_1 is positive)
+                // no pos, sort by id
                 return a < b;
             }
             // shared prompts go first
@@ -3229,30 +3209,25 @@ struct llama_sbatch {
         // init seq
         llama_sbatch_seq * last_seq = nullptr;
 
-        if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) {
-            for (size_t i = 0; i < n_tokens; ++i) {
-                const size_t bi = ids[i];
-                const int32_t n_seqs = batch.n_seq_id[bi];
-                llama_seq_id * seq_ids = batch.seq_id[bi];
-                if (last_seq != nullptr) {
-                    bool same = n_seqs == last_seq->n_seq_id;
-                    for (int32_t j = 0; same && j < n_seqs; ++j) {
-                        if (seq_ids[j] != last_seq->seq_id[j]) {
-                            same = false;
-                        }
-                    }
-                    if (same) {
-                        last_seq->length += 1;
-                        continue;
+        for (size_t i = 0; i < n_tokens; ++i) {
+            const size_t bi = ids[i];
+            const int32_t n_seqs = batch.n_seq_id[bi];
+            llama_seq_id * seq_ids = batch.seq_id[bi];
+            if (last_seq != nullptr) {
+                bool same = n_seqs == last_seq->n_seq_id;
+                for (int32_t j = 0; same && j < n_seqs; ++j) {
+                    if (seq_ids[j] != last_seq->seq_id[j]) {
+                        same = false;
                     }
                 }
-                llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, batch.all_seq_id};
-                seq.push_back(new_seq);
-                last_seq = &seq.back();
+                if (same) {
+                    last_seq->length += 1;
+                    continue;
+                }
             }
-        } else {
-            llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id};
+            llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
             seq.push_back(new_seq);
+            last_seq = &seq.back();
         }
         // keep shared prompts first at the end, then sort by length descending.
         std::sort(seq.begin(), seq.end(),
@@ -21067,25 +21042,6 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }
 
-struct llama_batch llama_batch_get_one(
-             llama_token * tokens,
-                   int32_t n_tokens,
-                 llama_pos pos_0,
-              llama_seq_id seq_id) {
-    return {
-        /*n_tokens   =*/ n_tokens,
-        /*tokens     =*/ tokens,
-        /*embd       =*/ nullptr,
-        /*pos        =*/ nullptr,
-        /*n_seq_id   =*/ nullptr,
-        /*seq_id     =*/ nullptr,
-        /*logits     =*/ nullptr,
-        /*all_pos_0  =*/ pos_0,
-        /*all_pos_1  =*/ 1,
-        /*all_seq_id =*/ seq_id,
-    };
-}
-
 struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
     llama_batch batch = {
         /*n_tokens =*/ 0,
@@ -21095,9 +21051,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
         /*n_seq_id   =*/ nullptr,
        /*seq_id     =*/ nullptr,
         /*logits     =*/ nullptr,
-        /*all_pos_0  =*/ 0,
-        /*all_pos_1  =*/ 0,
-        /*all_seq_id =*/ 0,
     };
 
     if (embd) {
