diff --git a/common/arg.cpp b/common/arg.cpp
index 5e07e8a699b8f..9f87e9910b540 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2097,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
-    add_opt(common_arg(
-        {"--perplexity", "--all-logits"},
-        string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
-        [](common_params & params) {
-            params.logits_all = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--hellaswag"},
         "compute HellaSwag score over random tasks from datafile supplied with -f",
diff --git a/common/common.cpp b/common/common.cpp
index 94f545f815c27..bd20af233695c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                     params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
diff --git a/common/common.h b/common/common.h
index 400f674b2283d..90702245463cb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -324,7 +324,6 @@ struct common_params {
     bool ctx_shift         = true;  // context shift on inifinite text generation
 
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all        = false; // return logits for all tokens in the batch
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
     bool verbose_prompt    = false; // print prompt tokens before generation
diff --git a/include/llama.h b/include/llama.h
index 06c56395c139f..e18e9b8da337f 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -351,19 +351,17 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
     };
 
     // model quantization parameters
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 45591be992d87..dadb87517f7ef 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -116,8 +116,6 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    logits_all = params.logits_all;
-
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
@@ -890,7 +888,7 @@ int llama_context::decode(llama_batch & inp_batch) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs_all += batch.logits[i] != 0;
         }
-    } else if (logits_all || embd_pooled) {
+    } else if (embd_pooled) {
         n_outputs_all = n_tokens_all;
     } else {
         // keep last output only
@@ -1853,13 +1851,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data    =*/ nullptr,
         /*.type_k               =*/ GGML_TYPE_F16,
         /*.type_v               =*/ GGML_TYPE_F16,
-        /*.logits_all           =*/ false,
+        /*.abort_callback       =*/ nullptr,
+        /*.abort_callback_data  =*/ nullptr,
         /*.embeddings           =*/ false,
         /*.offload_kqv          =*/ true,
         /*.flash_attn           =*/ false,
         /*.no_perf              =*/ true,
-        /*.abort_callback       =*/ nullptr,
-        /*.abort_callback_data  =*/ nullptr,
     };
 
     return result;
diff --git a/src/llama-context.h b/src/llama-context.h
index cf41ac57b9fba..5a080e67fcc4b 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -187,9 +187,6 @@ struct llama_context {
 
     std::unique_ptr<llama_memory_i> memory;
 
-    // TODO: remove
-    bool logits_all = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t  logits_size = 0; // capacity (of floats) for logits
     float * logits      = nullptr;
diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp
index b4640f9faf229..2c39278dba3d9 100644
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -585,7 +585,6 @@ int main(int argc, char ** argv) {
     params.out_file = "imatrix.dat" ;
 
     params.n_ctx = 512;
-    params.logits_all = true;
     params.escape = false;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index c59b941bf5e47..756297c257a6e 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -99,14 +99,6 @@ int main(int argc, char ** argv) {
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
 
-    if (params.logits_all) {
-        LOG_ERR("************\n");
-        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        LOG_ERR("************\n\n");
-
-        return 0;
-    }
-
     if (params.embedding) {
         LOG_ERR("************\n");
         LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp
index 175f2804b5da0..b5cdf5beb1b24 100644
--- a/tools/perplexity/perplexity.cpp
+++ b/tools/perplexity/perplexity.cpp
@@ -1554,7 +1554,10 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
         if (int(batch_indeces.size()) != num_answers) {
             batch_indeces.resize(num_answers);
         }
-        for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
+
+        for (int s = 0; s < num_answers; ++s) {
+            batch_indeces[s] = s0 + s;
+        }
 
         for (size_t i = 0; i < cur_task.common_prefix; ++i) {
             //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
@@ -1970,7 +1973,6 @@ int main(int argc, char ** argv) {
     common_params params;
 
     params.n_ctx = 512;
-    params.logits_all = true;
     params.escape = false;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
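
Migration note: the removed logits_all flag was already marked deprecated in llama.h in favor of setting llama_batch.logits per token. Below is a minimal sketch (not part of the patch) of that replacement pattern, assuming an existing llama_context * ctx and a tokenized prompt in std::vector<llama_token> tokens (both placeholder names), and using the common_batch_add helper from common/common.h:

    // needs "llama.h" and "common.h"
    // mark every token as an output so llama_decode() keeps its logits,
    // instead of relying on the old context-wide logits_all flag
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), /*embd*/ 0, /*n_seq_max*/ 1);
    for (size_t i = 0; i < tokens.size(); ++i) {
        common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, /*logits*/ true);
    }
    if (llama_decode(ctx, batch) != 0) {
        // decode failed; handle the error
    } else {
        // one row of logits is stored per token marked with logits == true, in batch order
        const float * logits_last = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        (void) logits_last;
    }
    llama_batch_free(batch);

Tools that previously relied on params.logits_all (perplexity, imatrix) already mark their output tokens this way when building batches, which is why the flag can be dropped without changing their behavior.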