Skip to content

Commit f9cc9f2

Browse files
committed
fix stats report
1 parent 678d7b1 commit f9cc9f2

File tree

1 file changed

+10
-6
lines changed

1 file changed

+10
-6
lines changed

tools/server/server.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,11 +1308,12 @@ struct server_slot {
13081308
common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
13091309

13101310
// stats
1311-
size_t n_sent_text = 0; // number of sent text character
1311+
size_t n_sent_text = 0; // number of sent text character
13121312

13131313
int64_t t_start_process_prompt;
13141314
int64_t t_start_generation;
13151315

1316+
size_t n_prompt_processing = 0; // number of decoded prompt tokens (may be less than prompt_tokens.n_kv_tokens(), in case we are using cache)
13161317
double t_prompt_processing; // ms
13171318
double t_token_generation; // ms
13181319

@@ -1334,6 +1335,7 @@ struct server_slot {
13341335
stopping_word = "";
13351336
n_past = 0;
13361337
n_sent_text = 0;
1338+
n_prompt_processing = 0;
13371339
task_type = SERVER_TASK_TYPE_COMPLETION;
13381340

13391341
generated_tokens.clear();
@@ -1402,10 +1404,10 @@ struct server_slot {
14021404

14031405
result_timings get_timings() const {
14041406
result_timings timings;
1405-
timings.prompt_n = prompt_tokens.n_kv_tokens();
1407+
timings.prompt_n = n_prompt_processing;
14061408
timings.prompt_ms = t_prompt_processing;
1407-
timings.prompt_per_token_ms = t_prompt_processing / prompt_tokens.n_kv_tokens();
1408-
timings.prompt_per_second = 1e3 / t_prompt_processing * prompt_tokens.n_kv_tokens();
1409+
timings.prompt_per_token_ms = t_prompt_processing / n_prompt_processing;
1410+
timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_processing;
14091411

14101412
timings.predicted_n = n_decoded;
14111413
timings.predicted_ms = t_token_generation;
@@ -3212,8 +3214,9 @@ struct server_context {
32123214
slot.cache_tokens.push_back(chunk.get()); // copy
32133215
}
32143216

3215-
slot.n_past += n_pos;
3216-
slot.n_kv_tokens += n_tok;
3217+
slot.n_past += n_pos;
3218+
slot.n_kv_tokens += n_tok;
3219+
slot.n_prompt_processing += n_tok; // for stats only
32173220
}
32183221

32193222
// add prompt tokens for processing in the current batch
@@ -3233,6 +3236,7 @@ struct server_context {
32333236

32343237
slot.n_kv_tokens++;
32353238
slot.n_past++;
3239+
slot.n_prompt_processing++; // for stats only
32363240
}
32373241

32383242
// SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());

0 commit comments

Comments
 (0)