Skip to content

Commit d4ddcac

Browse files
committed
llama : remove logits_all flag + reorder llama_context_params
ggml-ci
1 parent e07cb9f commit d4ddcac

File tree

3 files changed

+8
-12
lines changed

3 files changed

+8
-12
lines changed

common/common.cpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
10961096
cparams.n_threads = params.cpuparams.n_threads;
10971097
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
10981098
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1099-
cparams.logits_all = false;
11001099
cparams.embeddings = params.embedding;
11011100
cparams.rope_scaling_type = params.rope_scaling_type;
11021101
cparams.rope_freq_base = params.rope_freq_base;

include/llama.h

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -351,19 +351,17 @@ extern "C" {
351351
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
352352
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
353353

354-
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
355-
// TODO: move at the end of the struct
356-
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
357-
bool embeddings; // if true, extract embeddings (together with logits)
358-
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
359-
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
360-
bool no_perf; // whether to measure performance timings
361-
362354
// Abort callback
363355
// if it returns true, execution of llama_decode() will be aborted
364356
// currently works only with CPU execution
365357
ggml_abort_callback abort_callback;
366358
void * abort_callback_data;
359+
360+
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
361+
bool embeddings; // if true, extract embeddings (together with logits)
362+
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
363+
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
364+
bool no_perf; // whether to measure performance timings
367365
};
368366

369367
// model quantization parameters

src/llama-context.cpp

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1851,13 +1851,12 @@ llama_context_params llama_context_default_params() {
18511851
/*.cb_eval_user_data =*/ nullptr,
18521852
/*.type_k =*/ GGML_TYPE_F16,
18531853
/*.type_v =*/ GGML_TYPE_F16,
1854-
/*.logits_all =*/ false,
1854+
/*.abort_callback =*/ nullptr,
1855+
/*.abort_callback_data =*/ nullptr,
18551856
/*.embeddings =*/ false,
18561857
/*.offload_kqv =*/ true,
18571858
/*.flash_attn =*/ false,
18581859
/*.no_perf =*/ true,
1859-
/*.abort_callback =*/ nullptr,
1860-
/*.abort_callback_data =*/ nullptr,
18611860
};
18621861

18631862
return result;

0 commit comments

Comments (0)