@@ -383,9 +383,9 @@ struct llama_client_slot
     bool stopped_eos = false;
     bool stopped_word = false;
     bool stopped_limit = false;
-    
+
     bool oaicompat = false;
-    std::string oaicompat_model = "";
+    std::string oaicompat_model;

     std::string stopping_word;

@@ -486,7 +486,7 @@ struct llama_client_slot
         };
     }

-    void print_timings() {
+    void print_timings() const {
        LOG_TEE("\n");
        LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
@@ -685,15 +685,15 @@ struct llama_server_context
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
         llama_sampling_params default_sparams;
-        
+
         if (data.count("__oaicompat") != 0) {
             slot->oaicompat = true;
             slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
         } else {
             slot->oaicompat = false;
             slot->oaicompat_model = "";
         }
-        
+
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
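Note: the slot setup above relies on the server's json_value(...) helper to read optional fields with a fallback. A minimal sketch of the semantics assumed here (the real helper is defined elsewhere in server.cpp; this one is renamed to make clear it is only an illustration):

    #include <string>
    #include "json.hpp"

    // Sketch only: return body[key] if present and non-null, otherwise the default.
    template <typename T>
    static T json_value_sketch(const nlohmann::json & body, const std::string & key, const T & default_value) {
        return (body.contains(key) && !body.at(key).is_null())
            ? body.at(key).get<T>()
            : default_value;
    }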
@@ -1284,7 +1284,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
-        task.data = data;
+        task.data = std::move(data);
         task.infill_mode = infill;
         task.embedding_mode = embedding;
         task.type = COMPLETION_TASK;
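Note: since data is taken by value, std::move(data) lets the task adopt the json's internal storage instead of copying what may be a large prompt payload. A self-contained sketch of the same pattern (task_like is a stand-in for task_server, for illustration only):

    #include <utility>
    #include "json.hpp"
    using json = nlohmann::json;

    struct task_like { json data; };   // stand-in for task_server

    void enqueue_sketch(json data) {   // `data` is this function's own copy
        task_like task;
        task.data = std::move(data);   // transfers the buffers; `data` is left in a valid but unspecified state
    }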
@@ -2252,29 +2252,27 @@ json oaicompat_completion_params_parse(
     llama_params["__oaicompat"] = true;

     // Map OpenAI parameters to llama.cpp parameters
-    llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
-    llama_params["temperature"] = json_value(body, "temperature", 0.8);
-    llama_params["top_k"] = json_value(body, "max_tokens", 40);
-    llama_params["top_p"] = json_value(body, "top_p", 0.95);
-    llama_params["n_predict"] = json_value(body, "max_tokens", -1);
-    llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
+    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["temperature"]       = json_value(body, "temperature", 0.8);
+    llama_params["top_k"]             = json_value(body, "top_k", 40);
+    llama_params["top_p"]             = json_value(body, "top_p", 0.95);
+    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
+    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
     llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"] = json_value(body, "seed", 0);
-    llama_params["stream"] = json_value(body, "stream", false);
-    llama_params["mirostat"] = json_value(body, "mirostat", false);
-    llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
-    llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
-    llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
-    llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
-    llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
-    llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
-    llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
-    
+    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
+    llama_params["seed"]              = json_value(body, "seed", 0);
+    llama_params["stream"]            = json_value(body, "stream", false);
+    llama_params["mirostat"]          = json_value(body, "mirostat", false);
+    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", 0.0);
+    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", 0.0);
+    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", false);
+    llama_params["typical_p"]         = json_value(body, "typical_p", 0.0);
+    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", 0);
+    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
+    llama_params["tfs_z"]             = json_value(body, "tfs_z", 0.0);
+
     if (llama_params.count("grammar") != 0) {
-        llama_params["grammar"] = json_value(
-            body, "grammar",
-            json::object());
+        llama_params["grammar"] = json_value(body, "grammar", json::object());
     }

     // Handle 'stop' field
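For reference, a hypothetical OpenAI-style request body and where its fields end up after oaicompat_completion_params_parse() (field names and values are illustrative; absent fields fall back to the defaults above):

    #include "json.hpp"
    using json = nlohmann::json;

    static json example_oai_request() {
        // "messages" becomes "prompt" via format_chatml(), "max_tokens" becomes "n_predict",
        // the remaining keys keep their names.
        return json::parse(R"({
            "model": "gpt-3.5-turbo",
            "messages": [{"role": "user", "content": "Hello!"}],
            "max_tokens": 64,
            "temperature": 0.7,
            "stream": true
        })");
    }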
@@ -2287,23 +2285,22 @@ json oaicompat_completion_params_parse(
             body, "stop",
             json::array());
     }
-    
+
     // Ensure there is ChatML-specific end sequence among stop words
     llama_params["stop"].push_back("<|im_end|>");

     return llama_params;
 }

-static json format_final_response_oaicompat(json request, task_result response,
-                                            bool streaming = false)
+static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
 {
     json result = response.result_json;

-    bool stopped_word = result.count("stopped_word") != 0;
-    bool stopped_eos = json_value(result, "stopped_eos", false);
+    bool stopped_word        = result.count("stopped_word") != 0;
+    bool stopped_eos         = json_value(result, "stopped_eos", false);
     int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-    int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
-    std::string content = json_value(result, "content", std::string(""));
+    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+    std::string content      = json_value(result, "content", std::string(""));

     std::string finish_reason = "length";
     if (stopped_word || stopped_eos) {
@@ -2314,10 +2311,10 @@ static json format_final_response_oaicompat(json request, task_result response,
         streaming ? json::array({json{{"finish_reason", finish_reason},
                                       {"index", 0},
                                       {"delta", json::object()}}})
-                    : json::array({json{{"finish_reason", finish_reason},
+                  : json::array({json{{"finish_reason", finish_reason},
                                       {"index", 0},
                                       {"message", json{{"content", content},
-                                                        {"role", "assistant"}}}}});
+                                                       {"role", "assistant"}}}}});

     std::time_t t = std::time(0);

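Written out as standalone JSON-building code, the non-streaming branch above yields a choices entry of roughly this shape (sketch only; the content value is a placeholder):

    #include "json.hpp"
    using json = nlohmann::json;

    static json example_final_choice() {
        // mirrors the non-streaming arm of the ternary above
        return json{{"finish_reason", "stop"},
                    {"index", 0},
                    {"message", json{{"content", "Hi! How can I help?"},
                                     {"role", "assistant"}}}};
    }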
@@ -2345,23 +2342,22 @@ static json format_final_response_oaicompat(json request, task_result response,
 }

 // return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(task_result response) {
+static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
     json result = response.result_json;

     if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
         return std::vector<json>({response.result_json});
     }

     bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
-    std::string modelname =
-        json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

-    bool stopped_word = json_value(result, "stopped_word", false);
-    bool stopped_eos = json_value(result, "stopped_eos", false);
-    bool stopped_limit = json_value(result, "stopped_limit", false);
+    bool stopped_word   = json_value(result, "stopped_word", false);
+    bool stopped_eos    = json_value(result, "stopped_eos", false);
+    bool stopped_limit  = json_value(result, "stopped_limit", false);
     std::string content = json_value(result, "content", std::string(""));

-    std::string finish_reason = "";
+    std::string finish_reason;
     if (stopped_word || stopped_eos) {
         finish_reason = "stop";
     }
@@ -2383,7 +2379,7 @@ static std::vector<json> format_partial_response_oaicompat(task_result response)
                 choices = json::array({json{{"finish_reason", nullptr},
                                             {"index", 0},
                                             {"delta", json{{"role", "assistant"}}}}});
-                } else {
+            } else {
                 // We have to send this as two updates to conform to openai behavior
                 json initial_ret = json{{"choices", json::array({json{
                                         {"finish_reason", nullptr},
@@ -2400,13 +2396,13 @@ static std::vector<json> format_partial_response_oaicompat(task_result response)
                             {"choices", json::array({json{{"finish_reason", nullptr},
                                                           {"index", 0},
                                                           {"delta", json{
-                                                              {"content", content}}}
+                                                          {"content", content}}}
                                                           }})},
                             {"created", t},
                             {"id", gen_chatcmplid()},
                             {"model", modelname},
                             {"object", "chat.completion.chunk"}};
-                
+
                 return std::vector<json>({initial_ret, second_ret});
             }
         } else {
@@ -2612,9 +2608,9 @@ int main(int argc, char **argv)
                 task_result result = llama.next_result(task_id);
                 if (!result.error) {
                     const std::string str =
-                    "data: " +
-                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                    "\n\n";
+                        "data: " +
+                        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+                        "\n\n";
                     LOG_VERBOSE("data stream", {
                         { "to_send", str }
                     });
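The streaming path above frames each result as a server-sent event: the literal prefix "data: ", the JSON payload dumped on a single line, and a blank line as terminator. A hypothetical helper producing the same framing (it mirrors the dump() call shown above: compact output, invalid UTF-8 replaced):

    #include <string>
    #include "json.hpp"

    static std::string sse_frame(const nlohmann::json & payload) {
        return "data: " + payload.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace) + "\n\n";
    }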
@@ -2627,9 +2623,9 @@ int main(int argc, char **argv)
                     }
                 } else {
                     const std::string str =
-                    "error: " +
-                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
-                    "\n\n";
+                        "error: " +
+                        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+                        "\n\n";
                     LOG_VERBOSE("data stream", {
                         { "to_send", str }
                     });
@@ -2655,13 +2651,13 @@ int main(int argc, char **argv)
    });


-    svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req,
-                                              httplib::Response &res)
+    // TODO: add mount point without "/v1" prefix -- how?
+    svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = oaicompat_completion_params_parse(json::parse(req.body));

                const int task_id = llama.request_completion(data, false, false);
-                
+
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
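A hypothetical client-side check of the new endpoint using cpp-httplib (the host and port are assumptions based on the server's defaults; the body shape follows oaicompat_completion_params_parse() above):

    #include <cstdio>
    #include "httplib.h"

    int main() {
        httplib::Client cli("http://127.0.0.1:8080");   // assumed default listen address
        auto res = cli.Post("/v1/chat/completions",
                            R"({"messages":[{"role":"user","content":"Hi"}],"stream":false})",
                            "application/json");
        if (res && res->status == 200) {
            std::printf("%s\n", res->body.c_str());     // prints the chat.completion JSON
        }
        return 0;
    }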
@@ -2683,7 +2679,7 @@ int main(int argc, char **argv)
                        task_result llama_result = llama.next_result(task_id);
                        if (!llama_result.error) {
                            std::vector<json> result_array = format_partial_response_oaicompat(llama_result);
-                            
+
                            for (auto it = result_array.begin(); it != result_array.end(); ++it)
                            {
                                if (!it->empty()) {
@@ -2725,7 +2721,7 @@ int main(int argc, char **argv)
                res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
            }
        });
-    
+
    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);