Commit 30f1db9

remove C API llama_batch_ext_init_from_text
1 parent bd51d63

File tree

11 files changed: +73 -90 lines
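At a glance, every call site moves from the removed C entry point to the header-only C++ wrapper. A minimal before/after sketch of the migration, assuming a ctx and a tokenized prompt as in the examples touched below:

    // before this commit (C API, caller must free):
    llama_batch_ext * batch = llama_batch_ext_init_from_text(tokens.data(), tokens.size(), 0, 0, true);
    llama_decode_ext(ctx, batch);
    llama_batch_ext_free(batch);

    // after this commit (C++ wrapper from include/llama-cpp.h, freed by unique_ptr):
    auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), tokens.size(), 0, 0, true);
    llama_decode_ext(ctx, batch.get());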

common/common.cpp

Lines changed: 2 additions & 2 deletions
@@ -1016,7 +1016,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (llama_model_has_encoder(model)) {
-            llama_batch_ext_ptr batch(llama_batch_ext_init_from_text(tmp.data(), tmp.size(), 0, 0, true));
+            auto batch = llama_batch_ext_ptr::init_from_text(tmp.data(), tmp.size(), 0, 0, true);
             llama_encode_ext(lctx, batch.get());
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
@@ -1026,7 +1026,7 @@ struct common_init_result common_init_from_params(common_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            llama_batch_ext_ptr batch(llama_batch_ext_init_from_text(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0, true));
+            auto batch = llama_batch_ext_ptr::init_from_text(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0, true);
             llama_decode_ext(lctx, batch.get());
         }
         llama_kv_self_clear(lctx);

examples/lookahead/lookahead.cpp

Lines changed: 2 additions & 2 deletions
@@ -92,8 +92,8 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();
 
     // eval the prompt
-    llama_batch_ext_ptr batch0(llama_batch_ext_init_from_text( inp.data(), n_input - 1, 0, 0, true));
-    llama_batch_ext_ptr batch1(llama_batch_ext_init_from_text(&inp.back(), 1, n_input - 1, 0, true));
+    auto batch0 = llama_batch_ext_ptr::init_from_text( inp.data(), n_input - 1, 0, 0, true);
+    auto batch1 = llama_batch_ext_ptr::init_from_text(&inp.back(), 1, n_input - 1, 0, true);
     llama_decode_ext(ctx, batch0.get());
     llama_decode_ext(ctx, batch1.get());

examples/lookup/lookup.cpp

Lines changed: 2 additions & 2 deletions
@@ -91,8 +91,8 @@ int main(int argc, char ** argv){
 
     const auto t_enc_start = ggml_time_us();
 
-    llama_batch_ext_ptr batch0(llama_batch_ext_init_from_text( inp.data(), n_input - 1, 0, 0, true));
-    llama_batch_ext_ptr batch1(llama_batch_ext_init_from_text(&inp.back(), 1, n_input - 1, 0, true));
+    auto batch0 = llama_batch_ext_ptr::init_from_text( inp.data(), n_input - 1, 0, 0, true);
+    auto batch1 = llama_batch_ext_ptr::init_from_text(&inp.back(), 1, n_input - 1, 0, true);
     llama_decode_ext(ctx, batch0.get());
     llama_decode_ext(ctx, batch1.get());

examples/run/run.cpp

Lines changed: 1 addition & 1 deletion
@@ -1017,7 +1017,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
         print_word_and_concatenate_to_response(piece, response);
 
         // prepare the next batch with the sampled token
-        batch.reset(llama_batch_ext_init_from_text(&new_token_id, 1, llama_data.n_past, 0, true));
+        batch = llama_batch_ext_ptr::init_from_text(&new_token_id, 1, llama_data.n_past, 0, true);
     }
 
     printf(LOG_COL_DEFAULT);

examples/save-load-state/save-load-state.cpp

Lines changed: 16 additions & 16 deletions
@@ -48,11 +48,11 @@ int main(int argc, char ** argv) {
     auto tokens = common_tokenize(ctx, params.prompt, true);
 
     // prepare the batch
-    llama_batch_ext * batch = llama_batch_ext_init_from_text(tokens.data(), tokens.size(), 0, 0, true);
+    auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), tokens.size(), 0, 0, true);
 
     // evaluate prompt
-    llama_decode_ext(ctx, batch);
-    n_past += llama_batch_ext_get_n_tokens(batch);
+    llama_decode_ext(ctx, batch.get());
+    n_past += llama_batch_ext_get_n_tokens(batch.get());
 
     // save state (rng, logits, embedding and kv_cache) to file
     {
@@ -79,13 +79,13 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;
 
-        llama_batch_ext_clear(batch);
+        llama_batch_ext_clear(batch.get());
         llama_seq_id seq_id = 0;
-        llama_batch_ext_add_text(batch, next_token, 0, &seq_id, 1, true);
+        llama_batch_ext_add_text(batch.get(), next_token, 0, &seq_id, 1, true);
 
-        if (llama_decode_ext(ctx, batch)) {
+        if (llama_decode_ext(ctx, batch.get())) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_batch_ext_free(batch);
+            llama_batch_ext_free(batch.get());
             return 1;
         }
         n_past += 1;
@@ -132,13 +132,13 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;
 
-        llama_batch_ext_clear(batch);
+        llama_batch_ext_clear(batch.get());
         llama_seq_id seq_id = 0;
-        llama_batch_ext_add_text(batch, next_token, 0, &seq_id, 1, true);
+        llama_batch_ext_add_text(batch.get(), next_token, 0, &seq_id, 1, true);
 
-        if (llama_decode_ext(ctx2, batch)) {
+        if (llama_decode_ext(ctx2, batch.get())) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_batch_ext_free(batch);
+            llama_batch_ext_free(batch.get());
             return 1;
         }
         n_past += 1;
@@ -214,13 +214,13 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result2 += next_token_str;
 
-        llama_batch_ext_clear(batch);
+        llama_batch_ext_clear(batch.get());
         llama_seq_id seq_id = 1; // seq 1 instead of 0
-        llama_batch_ext_add_text(batch, next_token, 0, &seq_id, 1, true);
+        llama_batch_ext_add_text(batch.get(), next_token, 0, &seq_id, 1, true);
 
-        if (llama_decode_ext(ctx3, batch)) {
+        if (llama_decode_ext(ctx3, batch.get())) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
-            llama_batch_ext_free(batch);
+            llama_batch_ext_free(batch.get());
             return 1;
         }
         n_past += 1;
@@ -232,7 +232,7 @@ int main(int argc, char ** argv) {
     llama_sampler_free(smpl2);
     llama_sampler_free(smpl3);
 
-    llama_batch_ext_free(batch);
+    llama_batch_ext_free(batch.get());
 
     if (result0 != result2) {
         fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);

examples/simple-chat/simple-chat.cpp

Lines changed: 8 additions & 7 deletions
@@ -1,4 +1,5 @@
 #include "llama.h"
+#include "llama-cpp.h"
 #include <cstdio>
 #include <cstring>
 #include <iostream>
@@ -109,21 +110,21 @@ int main(int argc, char ** argv) {
 
         // prepare a batch for the prompt
         llama_pos n_past = 0;
-        llama_batch_ext * batch = llama_batch_ext_init_from_text(prompt_tokens.data(), prompt_tokens.size(), n_past, 0, true);
-        n_past += llama_batch_ext_get_n_tokens(batch);
+        auto batch = llama_batch_ext_ptr::init_from_text(prompt_tokens.data(), prompt_tokens.size(), n_past, 0, true);
+        n_past += llama_batch_ext_get_n_tokens(batch.get());
 
         llama_token new_token_id;
         while (true) {
             // check if we have enough space in the context to evaluate this batch
             int n_ctx = llama_n_ctx(ctx);
             int n_ctx_used = llama_kv_self_used_cells(ctx);
-            if (n_ctx_used + llama_batch_ext_get_n_tokens(batch) > n_ctx) {
+            if (n_ctx_used + llama_batch_ext_get_n_tokens(batch.get()) > n_ctx) {
                 printf("\033[0m\n");
                 fprintf(stderr, "context size exceeded\n");
                 exit(0);
             }
 
-            if (llama_decode_ext(ctx, batch)) {
+            if (llama_decode_ext(ctx, batch.get())) {
                 GGML_ABORT("failed to decode\n");
             }
 
@@ -147,13 +148,13 @@ int main(int argc, char ** argv) {
             response += piece;
 
             // prepare the next batch with the sampled token
-            llama_batch_ext_clear(batch);
+            llama_batch_ext_clear(batch.get());
            llama_seq_id seq_id = 0;
-            llama_batch_ext_add_text(batch, new_token_id, n_past, &seq_id, 1, true);
+            llama_batch_ext_add_text(batch.get(), new_token_id, n_past, &seq_id, 1, true);
             n_past++;
         }
 
-        llama_batch_ext_free(batch);
+        llama_batch_ext_free(batch.get());
 
         return response;
     };

examples/simple/simple.cpp

Lines changed: 7 additions & 7 deletions
@@ -1,4 +1,5 @@
 #include "llama.h"
+#include "llama-cpp.h"
 #include <cstdio>
 #include <cstring>
 #include <string>
@@ -143,22 +144,22 @@ int main(int argc, char ** argv) {
 
     // prepare a batch for the prompt
 
-    llama_batch_ext * batch = llama_batch_ext_init_from_text(prompt_tokens.data(), prompt_tokens.size(), 0, 0, true);
+    auto batch = llama_batch_ext_ptr::init_from_text(prompt_tokens.data(), prompt_tokens.size(), 0, 0, true);
 
     // main loop
 
     const auto t_main_start = ggml_time_us();
     int n_decode = 0;
     llama_token new_token_id;
 
-    for (int n_pos = 0; n_pos + llama_batch_ext_get_n_tokens(batch) < n_prompt + n_predict; ) {
+    for (int n_pos = 0; n_pos + llama_batch_ext_get_n_tokens(batch.get()) < n_prompt + n_predict; ) {
         // evaluate the current batch with the transformer model
-        if (llama_decode_ext(ctx, batch)) {
+        if (llama_decode_ext(ctx, batch.get())) {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
 
-        n_pos += llama_batch_ext_get_n_tokens(batch);
+        n_pos += llama_batch_ext_get_n_tokens(batch.get());
 
         // sample the next token
         {
@@ -180,9 +181,9 @@ int main(int argc, char ** argv) {
         fflush(stdout);
 
         // prepare the next batch with the sampled token
-        llama_batch_ext_clear(batch);
+        llama_batch_ext_clear(batch.get());
         llama_seq_id seq_id = 0;
-        llama_batch_ext_add_text(batch, new_token_id, n_pos, &seq_id, 1, true);
+        llama_batch_ext_add_text(batch.get(), new_token_id, n_pos, &seq_id, 1, true);
 
         n_decode += 1;
     }
@@ -200,7 +201,6 @@ int main(int argc, char ** argv) {
     llama_perf_context_print(ctx);
     fprintf(stderr, "\n");
 
-    llama_batch_ext_free(batch);
     llama_sampler_free(smpl);
     llama_free(ctx);
     llama_model_free(model);

examples/speculative/speculative.cpp

Lines changed: 3 additions & 3 deletions
@@ -165,9 +165,9 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();
 
     // eval the prompt with both models
-    llama_batch_ext_ptr batch0(llama_batch_ext_init_from_text( inp.data(), n_input - 1, 0, 0, true));
-    llama_batch_ext_ptr batch1(llama_batch_ext_init_from_text(&inp.back(), 1, n_input - 1, 0, true));
-    llama_batch_ext_ptr batch2(llama_batch_ext_init_from_text( inp.data(), n_input    , 0, 0, true));
+    auto batch0 = llama_batch_ext_ptr::init_from_text( inp.data(), n_input - 1, 0, 0, true);
+    auto batch1 = llama_batch_ext_ptr::init_from_text(&inp.back(), 1, n_input - 1, 0, true);
+    auto batch2 = llama_batch_ext_ptr::init_from_text( inp.data(), n_input    , 0, 0, true);
     llama_decode_ext(ctx_tgt, batch0.get());
     llama_decode_ext(ctx_tgt, batch1.get());
    llama_decode_ext(ctx_dft, batch2.get());

include/llama-cpp.h

Lines changed: 21 additions & 11 deletions
@@ -37,21 +37,31 @@ struct llama_batch_ext_ptr : std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter> {
     llama_batch_ext_ptr() : std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter>() {}
     llama_batch_ext_ptr(llama_batch_ext * batch) : std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter>(batch) {}
 
-    // convenience function to create a batch from text tokens, without worrying about manually freeing it
+    // Convenience C++ wrapper to create a batch from text tokens, without worrying about manually freeing it
+    // First token will be at position pos0
+    // The sequence ID will be fixed to seq_id
+    // If output_last is true, the last token will have output set
     static llama_batch_ext_ptr init_from_text(llama_token * tokens,
-                                              int32_t n_tokens,
-                                              int32_t pos0,
-                                              int32_t seq_id,
-                                              bool output_last) {
-        return llama_batch_ext_ptr(llama_batch_ext_init_from_text(tokens, n_tokens, pos0, seq_id, output_last));
+                                              int32_t      n_tokens,
+                                              llama_pos    pos0,
+                                              llama_seq_id seq_id,
+                                              bool         output_last) {
+        llama_batch_ext * batch = llama_batch_ext_init(n_tokens, 1);
+        for (int32_t i = 0; i < n_tokens; i++) {
+            llama_batch_ext_add_text(batch, tokens[i], pos0 + i, &seq_id, 1, false);
+        }
+        if (output_last) {
+            llama_batch_ext_set_output_last(batch);
+        }
+        return llama_batch_ext_ptr(batch);
     }
 
-    // convenience function to create a batch from text embeddings, without worrying about manually freeing it
+    // Convenience C++ wrapper to create a batch from text embeddings, without worrying about manually freeing it
     static llama_batch_ext_ptr init_from_embd(float * embd,
-                                              size_t n_tokens,
-                                              size_t n_embd,
-                                              int32_t pos0,
-                                              int32_t seq_id) {
+                                              size_t       n_tokens,
+                                              size_t       n_embd,
+                                              llama_pos    pos0,
+                                              llama_seq_id seq_id) {
         return llama_batch_ext_ptr(llama_batch_ext_init_from_embd(embd, n_tokens, n_embd, pos0, seq_id));
     }
 };
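The token loop formerly in src/llama-batch.cpp now lives in this header: tokens get consecutive positions starting at pos0, all on the single sequence seq_id, and only the last token is flagged for output when output_last is true. A short usage sketch under those semantics (the model/context setup is assumed):

    #include "llama-cpp.h"
    #include <vector>

    std::vector<llama_token> toks = { /* tokenized prompt */ };
    // positions pos0 .. pos0 + n_tokens - 1 on sequence 0; only the last token has output set
    auto batch = llama_batch_ext_ptr::init_from_text(toks.data(), (int32_t) toks.size(),
                                                     /*pos0=*/0, /*seq_id=*/0, /*output_last=*/true);
    if (llama_decode_ext(ctx, batch.get()) != 0) {
        // handle failure; no llama_batch_ext_free() needed, the unique_ptr owns the batch
    }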

include/llama.h

Lines changed: 6 additions & 18 deletions
@@ -900,7 +900,7 @@ extern "C" {
     //
     DEPRECATED(LLAMA_API struct llama_batch llama_batch_get_one(
             llama_token * tokens,
-            int32_t n_tokens), "use llama_batch_ext_init_from_text instead");
+            int32_t n_tokens), "use llama_batch_ext API instead");
 
     // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
     // Each token can be assigned up to n_seq_max sequence ids
@@ -925,30 +925,18 @@ extern "C" {
             int32_t n_tokens,
             int32_t n_seq_max);
 
-    // Same with llama_batch_init, but initializes the batch with the provided text tokens
-    // First token will be at position pos0
-    // The sequence ID will be fixed to seq_id
-    // If output_last is true, the last token will have output set
-    // The batch has to be freed with llama_batch_ext_free()
-    LLAMA_API struct llama_batch_ext * llama_batch_ext_init_from_text(
-            llama_token * tokens,
-            int32_t n_tokens,
-            int32_t pos0,
-            int32_t seq_id,
-            bool output_last);
-
     // Same with llama_batch_init, but initializes the batch with the provided raw embeddings
     // Size of embd should be n_tokens * n_embd
     // n_embd is the number of embeddings per token, can be obtained from llama_model_n_embd()
     // First token will be at position pos0
     // The sequence ID will be fixed to seq_id
     // The batch has to be freed with llama_batch_ext_free()
     LLAMA_API struct llama_batch_ext * llama_batch_ext_init_from_embd(
-            float * embd,
-            size_t n_tokens,
-            size_t n_embd,
-            int32_t pos0,
-            int32_t seq_id);
+            const float * embd,
+            size_t        n_tokens,
+            size_t        n_embd,
+            llama_pos     pos0,
+            llama_seq_id  seq_id);
 
     // Set arbitrary token to the embeddings batch
     // Note: this is only to be used in conjunction with llama_batch_ext_init_from_embd()
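For comparison, the embeddings initializer stays in the C API, now with const-correct, strongly typed parameters. A hedged sketch of a call against the new signature (model, ctx, and the embedding values are assumed; n_embd comes from llama_model_n_embd() as the comment above notes):

    const size_t n_embd   = (size_t) llama_model_n_embd(model); // embeddings per token
    const size_t n_tokens = 4;                                  // example batch size
    std::vector<float> embd(n_tokens * n_embd);                 // must hold n_tokens * n_embd floats
    llama_batch_ext * batch = llama_batch_ext_init_from_embd(embd.data(), n_tokens, n_embd,
                                                             /*pos0=*/0, /*seq_id=*/0);
    llama_decode_ext(ctx, batch);
    llama_batch_ext_free(batch); // raw C API batch is still freed manually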

src/llama-batch.cpp

Lines changed: 5 additions & 21 deletions
@@ -337,22 +337,6 @@ struct llama_batch llama_batch_get_one(
     };
 }
 
-struct llama_batch_ext * llama_batch_ext_init_from_text(
-        llama_token * tokens,
-        int32_t n_tokens,
-        int32_t pos0,
-        int32_t seq_id,
-        bool output_last) {
-    llama_batch_ext * batch = llama_batch_ext_init(n_tokens, 1);
-    for (int32_t i = 0; i < n_tokens; i++) {
-        llama_batch_ext_add_text(batch, tokens[i], pos0 + i, &seq_id, 1, false);
-    }
-    if (output_last) {
-        llama_batch_ext_set_output_last(batch);
-    }
-    return batch;
-}
-
 static struct llama_batch_ext * llama_batch_ext_init_impl(int32_t n_tokens_alloc, int32_t n_embd, int32_t n_seq_max) {
     llama_batch_ext * batch = new llama_batch_ext{
         /*n_tokens =*/ 0,
@@ -390,11 +374,11 @@ struct llama_batch_ext * llama_batch_ext_init(int32_t n_tokens_alloc, int32_t n_seq_max) {
 }
 
 struct llama_batch_ext * llama_batch_ext_init_from_embd(
-        float * embd,
-        size_t n_tokens,
-        size_t n_embd,
-        int32_t pos0,
-        int32_t seq_id) {
+        const float * embd,
+        size_t        n_tokens,
+        size_t        n_embd,
+        llama_pos     pos0,
+        llama_seq_id  seq_id) {
     struct llama_batch_ext * batch = llama_batch_ext_init_impl(n_tokens, n_embd, 1);
     memcpy(batch->embd, embd, n_tokens * n_embd * sizeof(float));
     for (size_t i = 0; i < n_tokens; i++) {
