@@ -3081,7 +3081,6 @@ struct server_context {
                 // without pooling, we want to output the embeddings for all the tokens in the batch
                 const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;

-                // batch.add_text(prompt_tokens[slot.n_past], slot.n_past, slot.id, need_embd);
                 llama_batch_ext_add_text(batch.get(), prompt_tokens[slot.n_past], slot.n_past, &slot.id, 1, need_embd);

                 if (slot.params.cache_prompt) {
@@ -3109,7 +3108,6 @@ struct server_context {
                 }

                 // extract the logits only for the last token
-                // batch.set_logits_last();
                 llama_batch_ext_set_output_last(batch.get());

                 slot.n_decoded = 0;
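
For orientation, the two hunks above follow one pattern: queue tokens with `llama_batch_ext_add_text`, then flag only the final queued token for logit extraction with `llama_batch_ext_set_output_last`. Below is a minimal sketch of that pattern. Only the `llama_batch_ext_*` calls and their argument order, taken directly from the diff, are assumed real; the wrapper function, its name, and its parameters are hypothetical stand-ins.

```cpp
// Hedged sketch, not the PR's code: queue the remaining prompt tokens for one
// sequence, then request logits only for the last queued token.
// fill_prompt_batch and its parameters are illustrative; the llama_batch_ext_*
// calls mirror the hunks above.
#include "llama.h"
#include <vector>

static void fill_prompt_batch(llama_batch_ext * batch,
                              const std::vector<llama_token> & prompt_tokens,
                              int n_past,
                              llama_seq_id seq_id,
                              bool need_embd) {
    for (int i = n_past; i < (int) prompt_tokens.size(); ++i) {
        // each token is tagged with a single sequence id; need_embd requests
        // per-token embedding output when pooling is disabled
        llama_batch_ext_add_text(batch, prompt_tokens[i], (llama_pos) i, &seq_id, 1, need_embd);
    }
    // extract the logits only for the last token
    llama_batch_ext_set_output_last(batch);
}
```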
@@ -3280,13 +3278,10 @@ struct server_context {
                 }

                 // construct the speculation batch
-                // slot.batch_spec.clear();
-                // slot.batch_spec.add_text(id, slot.n_past, slot.id, true);
                 llama_batch_ext_clear(slot.batch_spec.get());
                 llama_batch_ext_add_text(slot.batch_spec.get(), id, slot.n_past, &slot.id, 1, true);

                 for (size_t i = 0; i < draft.size(); ++i) {
-                    // slot.batch_spec.add_text(draft[i], slot.n_past + 1 + i, slot.id, true);
                     llama_batch_ext_add_text(slot.batch_spec.get(), draft[i], slot.n_past + 1 + i, &slot.id, 1, true);
                 }

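The speculation hunk rebuilds a per-slot batch from scratch: clear it, queue the freshly sampled token at `slot.n_past`, then queue each draft token at the following positions, all with output enabled so every draft position can be verified against the target model. A minimal sketch of that rebuild follows; the wrapper function and its parameters are hypothetical stand-ins, while the `llama_batch_ext_*` calls mirror the hunk above.

```cpp
// Hedged sketch, not the PR's code: rebuild a speculation batch for one
// sequence. fill_spec_batch and its parameters are illustrative.
#include "llama.h"
#include <vector>

static void fill_spec_batch(llama_batch_ext * batch,
                            llama_token id,
                            const std::vector<llama_token> & draft,
                            llama_pos n_past,
                            llama_seq_id seq_id) {
    llama_batch_ext_clear(batch);
    // the freshly sampled token sits at position n_past, with output enabled
    llama_batch_ext_add_text(batch, id, n_past, &seq_id, 1, true);
    // each draft token follows at n_past + 1 + i, also with output enabled
    for (size_t i = 0; i < draft.size(); ++i) {
        llama_batch_ext_add_text(batch, draft[i], n_past + 1 + (llama_pos) i, &seq_id, 1, true);
    }
}
```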