@@ -1308,11 +1308,12 @@ struct server_slot {
1308
1308
common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
1309
1309
1310
1310
// stats
1311
- size_t n_sent_text = 0 ; // number of sent text character
1311
+ size_t n_sent_text = 0 ; // number of sent text characters
1312
1312
1313
1313
int64_t t_start_process_prompt;
1314
1314
int64_t t_start_generation;
1315
1315
1316
+ size_t n_prompt_processing = 0 ; // number of decoded prompt tokens (may be less than prompt_tokens.n_kv_tokens(), in case we are using cache)
1316
1317
double t_prompt_processing; // ms
1317
1318
double t_token_generation; // ms
1318
1319
@@ -1334,6 +1335,7 @@ struct server_slot {
1334
1335
stopping_word = " " ;
1335
1336
n_past = 0 ;
1336
1337
n_sent_text = 0 ;
1338
+ n_prompt_processing = 0 ;
1337
1339
task_type = SERVER_TASK_TYPE_COMPLETION;
1338
1340
1339
1341
generated_tokens.clear ();
@@ -1402,10 +1404,10 @@ struct server_slot {
1402
1404
1403
1405
result_timings get_timings () const {
1404
1406
result_timings timings;
1405
- timings.prompt_n = prompt_tokens. n_kv_tokens () ;
1407
+ timings.prompt_n = n_prompt_processing ;
1406
1408
timings.prompt_ms = t_prompt_processing;
1407
- timings.prompt_per_token_ms = t_prompt_processing / prompt_tokens. n_kv_tokens () ;
1408
- timings.prompt_per_second = 1e3 / t_prompt_processing * prompt_tokens. n_kv_tokens () ;
1409
+ timings.prompt_per_token_ms = t_prompt_processing / n_prompt_processing ;
1410
+ timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_processing ;
1409
1411
1410
1412
timings.predicted_n = n_decoded;
1411
1413
timings.predicted_ms = t_token_generation;
@@ -3212,8 +3214,9 @@ struct server_context {
3212
3214
slot.cache_tokens .push_back (chunk.get ()); // copy
3213
3215
}
3214
3216
3215
- slot.n_past += n_pos;
3216
- slot.n_kv_tokens += n_tok;
3217
+ slot.n_past += n_pos;
3218
+ slot.n_kv_tokens += n_tok;
3219
+ slot.n_prompt_processing += n_tok; // for stats only
3217
3220
}
3218
3221
3219
3222
// add prompt tokens for processing in the current batch
@@ -3233,6 +3236,7 @@ struct server_context {
3233
3236
3234
3237
slot.n_kv_tokens ++;
3235
3238
slot.n_past ++;
3239
+ slot.n_prompt_processing ++; // for stats only
3236
3240
}
3237
3241
3238
3242
// SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
0 commit comments