
Commit 51fb96b

context : remove logits_all flag (#13284)
* context : remove logits_all flag (ggml-ci)
* llama : remove logits_all flag + reorder llama_context_params (ggml-ci)

1 parent 70a6991 · commit 51fb96b

File tree

9 files changed (+13, -37 lines)

common/arg.cpp

Lines changed: 0 additions & 7 deletions

@@ -2097,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
-    add_opt(common_arg(
-        {"--perplexity", "--all-logits"},
-        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
-        [](common_params & params) {
-            params.logits_all = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag"},
         "compute HellaSwag score over random tasks from datafile supplied with -f",

common/common.cpp

Lines changed: 0 additions & 1 deletion

@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads       = params.cpuparams.n_threads;
     cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
                               params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
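common_context_params_to_llama() copies CLI-level settings onto the public llama_context_params, so the removed line mirrors the struct change in include/llama.h below. A minimal sketch of constructing context params through the public API after this commit (assumes a loaded llama_model * model):

    // Sketch: llama_context_params has no logits_all field anymore;
    // per-token output selection happens via llama_batch.logits instead.
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx      = 512;
    cparams.embeddings = false; // the embeddings flag remains
    llama_context * ctx = llama_init_from_model(model, cparams);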

common/common.h

Lines changed: 0 additions & 1 deletion

@@ -324,7 +324,6 @@ struct common_params {
     bool ctx_shift = true; // context shift on infinite text generation
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation

include/llama.h

Lines changed: 6 additions & 8 deletions

@@ -351,19 +351,17 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
     };
 
     // model quantization parameters
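The deleted comment already named the migration path: set llama_batch.logits per token instead of a context-wide flag. A minimal sketch reproducing the old logits_all behavior (assumes ctx is an initialized llama_context and tokens holds n_tokens prompt tokens):

    // Sketch: request logits for every position by flagging each batch entry.
    llama_batch batch = llama_batch_init(n_tokens, /*embd =*/ 0, /*n_seq_max =*/ 1);
    batch.n_tokens = n_tokens;
    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.token   [i] = tokens[i];
        batch.pos     [i] = i;
        batch.n_seq_id[i] = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i] = true; // per-token flag (was: cparams.logits_all)
    }

    if (llama_decode(ctx, batch) == 0) {
        // with every token flagged, output row i corresponds to token i
        const float * logits_0 = llama_get_logits_ith(ctx, 0);
    }

    llama_batch_free(batch);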

src/llama-context.cpp

Lines changed: 3 additions & 6 deletions

@@ -116,8 +116,6 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    logits_all = params.logits_all;
-
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {

@@ -890,7 +888,7 @@ int llama_context::decode(llama_batch & inp_batch) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs_all += batch.logits[i] != 0;
         }
-    } else if (logits_all || embd_pooled) {
+    } else if (embd_pooled) {
         n_outputs_all = n_tokens_all;
     } else {
         // keep last output only

@@ -1853,13 +1851,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data   =*/ nullptr,
         /*.type_k              =*/ GGML_TYPE_F16,
         /*.type_v              =*/ GGML_TYPE_F16,
-        /*.logits_all          =*/ false,
+        /*.abort_callback      =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
         /*.no_perf             =*/ true,
-        /*.abort_callback      =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
     };
 
     return result;
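The simplified branch in decode() makes the default explicit: when no batch.logits entries are set and pooled embeddings are not requested, only the last token's output is kept. A short sketch of reading that single output (llama_get_logits_ith accepts negative indices, counting back from the last output):

    // Sketch: with batch.logits left unset, only the last output survives
    // (the "keep last output only" branch above).
    if (llama_decode(ctx, batch) == 0) {
        const float * last_logits = llama_get_logits_ith(ctx, -1);
    }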

src/llama-context.h

Lines changed: 0 additions & 3 deletions

@@ -187,9 +187,6 @@ struct llama_context {
 
     std::unique_ptr<llama_memory_i> memory;
 
-    // TODO: remove
-    bool logits_all = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t logits_size = 0; // capacity (of floats) for logits
     float * logits = nullptr;

tools/imatrix/imatrix.cpp

Lines changed: 0 additions & 1 deletion

@@ -585,7 +585,6 @@ int main(int argc, char ** argv) {
     params.out_file = "imatrix.dat";
 
     params.n_ctx = 512;
-    params.logits_all = true;
     params.escape = false;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
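With params.logits_all gone, imatrix has to request per-token logits itself when it builds batches, as perplexity does. A hedged sketch using the common_batch_add helper from common.h (loop bounds and token source are illustrative):

    // Sketch: the final argument sets llama_batch.logits for this entry.
    for (int i = 0; i < n_tokens; ++i) {
        common_batch_add(batch, tokens[i], /*pos =*/ i, /*seq_ids =*/ { 0 }, /*logits =*/ true);
    }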

tools/main/main.cpp

Lines changed: 0 additions & 8 deletions

@@ -99,14 +99,6 @@ int main(int argc, char ** argv) {
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
 
-    if (params.logits_all) {
-        LOG_ERR("************\n");
-        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        LOG_ERR("************\n\n");
-
-        return 0;
-    }
-
     if (params.embedding) {
         LOG_ERR("************\n");
         LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);

tools/perplexity/perplexity.cpp

Lines changed: 4 additions & 2 deletions

@@ -1554,7 +1554,10 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
     if (int(batch_indeces.size()) != num_answers) {
         batch_indeces.resize(num_answers);
     }
-    for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
+
+    for (int s = 0; s < num_answers; ++s) {
+        batch_indeces[s] = s0 + s;
+    }
 
     for (size_t i = 0; i < cur_task.common_prefix; ++i) {
         //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);

@@ -1970,7 +1973,6 @@ int main(int argc, char ** argv) {
     common_params params;
 
     params.n_ctx = 512;
-    params.logits_all = true;
     params.escape = false;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
