
Commit 1c48616: adapt all examples

Parent: b226c5b
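The change these hunks adapt to is an API simplification: llama_batch_get_one() loses its trailing pos_0 and seq_id arguments, and llama_batch loses the three trailing fields they fed (the all_pos_0/all_pos_1/all_seq_id slots, going by the placeholder initializers removed below). Positions and sequence ids for such batches are now filled in centrally; see the llama_batch_allocr hunk in src/llama.cpp at the end. A minimal before/after sketch of a typical call site, assuming a tokenized prompt in tokens:

    // before: starting position and sequence id passed at every call site
    // llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));

    // after: only the token span is passed; defaults are applied internally
    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));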

20 files changed (+92, -37 lines)

common/common.cpp

Lines changed: 2 additions & 2 deletions
@@ -912,7 +912,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }

         if (llama_model_has_encoder(model)) {
-            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == -1) {
                 decoder_start_token_id = bos;
@@ -921,7 +921,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);

examples/batched-bench/batched-bench.cpp

Lines changed: 0 additions & 1 deletion
@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };

             const int ret = llama_decode(ctx, batch_view);
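The same three placeholder zeros disappear from identical batch-view initializers in parallel.cpp, perplexity.cpp, and server.cpp below. For orientation, a sketch of the full initializer after the change; the trailing fields are visible in the hunk, the leading ones are assumed from the original file:

    llama_batch batch_view = {
        n_tokens,
        batch.token    + i,
        nullptr, // embd: this view carries tokens, not embeddings
        batch.pos      + i,
        batch.n_seq_id + i,
        batch.seq_id   + i,
        batch.logits   + i,
    };
    const int ret = llama_decode(ctx, batch_view);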

examples/cvector-generator/cvector-generator.cpp

Lines changed: 1 addition & 1 deletion
@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {

 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
     }

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {

     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }

examples/imatrix/imatrix.cpp

Lines changed: 11 additions & 2 deletions
@@ -508,12 +508,21 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
             tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
         }

-        // TODO: use batch.logits to save computations instead of relying on logits_all == true
-        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+        llama_batch batch = llama_batch_init(batch_size, 0, 1);
+        for (int i = 0; i < batch_size; i++) {
+            batch. token[i]    = tokens[batch_start + i];
+            batch.   pos[i]    = j*n_batch + i;
+            batch.logits[i]    = true;
+            batch.seq_id[i][0] = 0;
+        }
+
+        if (llama_decode(ctx, batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }

+        llama_batch_free(batch);
+
         // restore the original token in case it was set to BOS
         tokens[batch_start] = token_org;

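Here the old reliance on logits_all gives way to an explicitly built batch: llama_batch_init() allocates the arrays, each slot gets a token, position, sequence id, and per-token logits flag, and llama_batch_free() releases the arrays once llama_decode() has consumed them (the batch's buffers must stay alive through the decode call, hence the free afterwards). The same pattern appears twice more in perplexity.cpp below. As a self-contained sketch, a hypothetical helper rather than code from this commit, with n_tokens and n_seq_id set explicitly here even though the hunk above apparently leaves them to llama_batch_init on this branch:

    // Build a single-sequence batch that requests logits for every token.
    // The caller must llama_batch_free() it after llama_decode().
    static llama_batch make_full_logits_batch(const llama_token * tokens, int32_t n_tokens, llama_pos pos_0) {
        llama_batch batch = llama_batch_init(n_tokens, /*embd =*/ 0, /*n_seq_max =*/ 1);
        batch.n_tokens = n_tokens;
        for (int32_t i = 0; i < n_tokens; i++) {
            batch.token   [i]    = tokens[i];
            batch.pos     [i]    = pos_0 + i;
            batch.n_seq_id[i]    = 1;    // one sequence id per token
            batch.seq_id  [i][0] = 0;    // everything on sequence 0
            batch.logits  [i]    = true; // keep logits for every position
        }
        return batch;
    }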

examples/infill/infill.cpp

Lines changed: 1 addition & 1 deletion
@@ -396,7 +396,7 @@ int main(int argc, char ** argv) {

             LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

-            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                 LOG_ERR("%s : failed to eval\n", __func__);
                 return 1;
             }

examples/llama-bench/llama-bench.cpp

Lines changed: 2 additions & 2 deletions
@@ -1446,7 +1446,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
         n_processed += n_tokens;
     }

@@ -1462,7 +1462,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;

     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
         llama_synchronize(ctx);
         token = std::rand() % n_vocab;
     }

examples/llama.android/llama/src/main/cpp/llama-android.cpp

Lines changed: 0 additions & 3 deletions
@@ -283,9 +283,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
             nullptr,
             nullptr,
             nullptr,
-            0,
-            0,
-            0,
     };

     if (embd) {

examples/llava/llava-cli.cpp

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
             LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }

examples/llava/llava.cpp

Lines changed: 36 additions & 2 deletions
@@ -401,6 +401,39 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     return true;
 }

+struct llava_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens =*/ n_tokens,
+            /*tokens   =*/ nullptr,
+            /*embd     =*/ embd,
+            /*pos      =*/ pos.data(),
+            /*n_seq_id =*/ n_seq_id.data(),
+            /*seq_id   =*/ seq_ids.data(),
+            /*logits   =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));

@@ -409,8 +442,9 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
+        float * embd = image_embed->embed+i*n_embd;
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }
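llava_embd_batch exists because the old one-line brace initializer depended on the removed trailing llama_batch fields to describe an all-embeddings batch. The wrapper owns pos, n_seq_id, seq_id, and logits as vectors, so the llama_batch it exposes points into storage that lives exactly as long as the wrapper does; the extra nullptr slot at seq_ids[n_tokens] appears to mirror the sentinel llama_batch_init itself leaves at the end of its seq_id array. Usage, as in the second hunk:

    // decode n_eval embedding vectors starting at offset i*n_embd,
    // positioned at *n_past on sequence 0
    float * embd = image_embed->embed + i*n_embd;
    llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
    if (llama_decode(ctx_llama, llava_batch.batch)) { /* handle error */ }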

examples/llava/minicpmv-cli.cpp

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
             LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }

examples/lookahead/lookahead.cpp

Lines changed: 2 additions & 2 deletions
@@ -89,8 +89,8 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();

     // eval the prompt
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

     for (int s = 1; s < W + G + 1; ++s) {
         llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);

examples/lookup/lookup.cpp

Lines changed: 2 additions & 2 deletions
@@ -89,8 +89,8 @@ int main(int argc, char ** argv){

     const auto t_enc_start = ggml_time_us();

-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

     const auto t_enc_end = ggml_time_us();

examples/main/main.cpp

Lines changed: 2 additions & 2 deletions
@@ -528,7 +528,7 @@ int main(int argc, char ** argv) {
             int enc_input_size = embd_inp.size();
             llama_token * enc_input_buf = embd_inp.data();

-            if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+            if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
                 LOG_ERR("%s : failed to eval\n", __func__);
                 return 1;
             }
@@ -648,7 +648,7 @@ int main(int argc, char ** argv) {

                 LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                     LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
                 }

examples/parallel/parallel.cpp

Lines changed: 0 additions & 1 deletion
@@ -308,7 +308,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };

             const int ret = llama_decode(ctx, batch_view);

examples/perplexity/perplexity.cpp

Lines changed: 22 additions & 5 deletions
@@ -409,13 +409,22 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);

+            llama_batch batch = llama_batch_init(batch_size, 0, 1);
+            for (int i = 0; i < batch_size; i++) {
+                batch. token[i]    = tokens[batch_start + i];
+                batch.   pos[i]    = j*n_batch + i;
+                batch.logits[i]    = true;
+                batch.seq_id[i][0] = 0;
+            }
+
             //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
-            // TODO: use llama_batch.logits instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            if (llama_decode(ctx, batch)) {
                 //LOG_ERR("%s : failed to eval\n", __func__);
                 return {tokens, -1, logit_history, prob_history};
             }

+            llama_batch_free(batch);
+
             // save original token and restore it after eval
             const auto token_org = tokens[batch_start];

@@ -699,7 +708,6 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
-            0, 0, 0, // unused
         };

         const int ret = llama_decode(ctx, batch_view);
@@ -1790,12 +1798,21 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
             tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
         }

-        // TODO: use llama_batch.logits instead of relying on logits_all == true
-        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+        llama_batch batch = llama_batch_init(batch_size, 0, 1);
+        for (int i = 0; i < batch_size; i++) {
+            batch. token[i]    = tokens[batch_start + i];
+            batch.   pos[i]    = j*n_batch + i;
+            batch.logits[i]    = true;
+            batch.seq_id[i][0] = 0;
+        }
+
+        if (llama_decode(ctx, batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return;
         }

+        llama_batch_free(batch);
+
         // restore the original token in case it was set to BOS
         tokens[batch_start] = token_org;

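With batch.logits[i] = true on every slot, the removed TODOs ("use llama_batch.logits instead of relying on logits_all == true") are resolved: per-token logits are now requested through the batch itself. After a successful decode, each position can then be read back through the public accessor; a short sketch:

    // logits for any position i with batch.logits[i] == true:
    const float * logits_i = llama_get_logits_ith(ctx, i);
    const int     n_vocab  = llama_n_vocab(llama_get_model(ctx));
    // logits_i points at n_vocab floats for position i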

examples/save-load-state/save-load-state.cpp

Lines changed: 4 additions & 4 deletions
@@ -49,7 +49,7 @@ int main(int argc, char ** argv) {
     auto tokens = llama_tokenize(ctx, params.prompt, true);

     // evaluate prompt
-    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
+    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));
     n_past += tokens.size();

     // save state (rng, logits, embedding and kv_cache) to file
@@ -77,7 +77,7 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;

-        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1))) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx);
             llama_free_model(model);
@@ -133,7 +133,7 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;

-        if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1))) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);
@@ -221,7 +221,7 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str.c_str());
         result2 += next_token_str;

-        if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+        if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx3);
            llama_free_model(model);
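One behavioral subtlety in the last hunk: the old call passed seq_id 1, apparently targeting the sequence this part of the example copies into, while the new llama_batch_get_one() takes no sequence id at all. If a specific sequence still needs to be addressed after this change, the batch has to be built explicitly; a hedged sketch using the public batch API:

    // decode a single token into sequence 1 explicitly (sketch; the
    // example itself now relies on the default sequence handling)
    llama_batch batch = llama_batch_init(1, /*embd =*/ 0, /*n_seq_max =*/ 1);
    batch.n_tokens     = 1;
    batch.token    [0] = next_token;
    batch.pos      [0] = n_past;
    batch.n_seq_id [0] = 1;
    batch.seq_id[0][0] = 1; // target sequence 1
    batch.logits   [0] = true;
    if (llama_decode(ctx3, batch)) { /* handle error */ }
    llama_batch_free(batch);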

examples/server/server.cpp

Lines changed: 0 additions & 1 deletion
@@ -2283,7 +2283,6 @@ struct server_context {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };

             const int ret = llama_decode(ctx, batch_view);

examples/speculative/speculative.cpp

Lines changed: 3 additions & 3 deletions
@@ -155,9 +155,9 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();

     // eval the prompt with both models
-    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
-    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
-    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0));
+    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1));
+    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));

     const auto t_enc_end = ggml_time_us();

src/llama.cpp

Lines changed: 1 addition & 0 deletions
@@ -21144,6 +21144,7 @@ struct llama_batch_allocr {
             logits[logits.size() - 1] = true;
             batch.logits = logits.data();
         }
+        return batch;
     }
 };

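This one-liner is a genuine fix: the helper builds and populates a batch but apparently fell off the end without returning it, which is undefined behavior for a value-returning function in C++. A minimal illustration of the bug class (hypothetical code, not the actual helper):

    llama_batch get_batch() {
        llama_batch batch = {};
        // ... fill in pos/seq_id/logits defaults ...
        return batch; // without this line, callers would read garbage
    }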
