3 files changed, +8 -12 lines changed

@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                 params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = false;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
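
The removed cparams.logits_all assignment matches the deprecation recorded in the header hunk below: callers now request logits per position through llama_batch.logits rather than once per context. A minimal sketch of the replacement pattern, assuming the llama.cpp C batch API of this era (llama_batch_init, per-position logits flags, llama_get_logits_ith):

    // Sketch: reproduce the old logits_all behavior per batch instead of per context.
    llama_batch batch = llama_batch_init(n_tokens, /*embd =*/ 0, /*n_seq_max =*/ 1);
    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.token[i]     = tokens[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = 1; // request logits for every position, not just the last
    }
    batch.n_tokens = n_tokens;
    llama_decode(ctx, batch); // logits for position i via llama_get_logits_ith(ctx, i)
    llama_batch_free(batch);
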
@@ -351,19 +351,17 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
     };

     // model quantization parameters
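
The relocated comment states the motivation: grouping the 1-byte bool fields at the end of llama_context_params keeps the struct's padding predictable when it is copied by value across the C ABI, for example by language bindings that mirror its layout. A toy illustration of the padding difference, using hypothetical structs rather than anything from llama.cpp:

    // Toy illustration (hypothetical structs, not from llama.cpp): a 1-byte bool
    // between pointer-sized members forces alignment padding at each boundary;
    // grouping the bools at the end packs them into one tail region instead.
    #include <cstdio>

    struct interleaved_t {
        void * a;  // offset 0
        bool   b1; // offset 8, followed by 7 bytes of padding
        void * c;  // offset 16
        bool   b2; // offset 24, followed by 7 bytes of tail padding
    };

    struct grouped_t {
        void * a;  // offset 0
        void * c;  // offset 8
        bool   b1; // offset 16
        bool   b2; // offset 17, followed by 6 bytes of tail padding
    };

    int main() {
        std::printf("interleaved: %zu bytes, grouped: %zu bytes\n",
                    sizeof(interleaved_t), sizeof(grouped_t)); // typically 32 vs 24 on x86-64
        return 0;
    }
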
@@ -1851,13 +1851,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data   =*/ nullptr,
         /*.type_k              =*/ GGML_TYPE_F16,
         /*.type_v              =*/ GGML_TYPE_F16,
-        /*.logits_all          =*/ false,
+        /*.abort_callback      =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
         /*.no_perf             =*/ true,
-        /*.abort_callback      =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
     };

     return result;
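
Note that the /*.field =*/ annotations in llama_context_default_params() are comments only; the braced list is positional aggregate initialization, which is why the abort_callback entries had to move in lockstep with the field reordering in the header. A reduced, hypothetical illustration:

    // Hypothetical reduced struct: the comments name the intended field, but
    // aggregate initialization assigns strictly by position, so initializer
    // order must match declaration order exactly.
    struct example_params {
        void * abort_callback_data;
        bool   embeddings;
    };

    example_params p = {
        /*.abort_callback_data =*/ nullptr,
        /*.embeddings          =*/ false, // reordering declarations without reordering
                                          // initializers would misassign fields (or fail
                                          // to compile when the types differ)
    };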