@@ -2208,7 +2208,7 @@ struct server_context {
2208
2208
bool full_stop_reached = false ;
2209
2209
bool partial_stop_reached = false ;
2210
2210
2211
- // search start strings
2211
+ // search the start strings
2212
2212
if (start_string_missing && !incomplete && slot.has_next_token ) {
2213
2213
size_t max_start_string_size = slot.params .start_string_max_len ;
2214
2214
size_t search_len = max_start_string_size + token_str.size ();
@@ -2230,17 +2230,11 @@ struct server_context {
2230
2230
}
2231
2231
}
2232
2232
2233
+ // search the stop strings
2233
2234
if (!incomplete) {
2234
2235
size_t pos = std::min (slot.n_sent_text , slot.generated_text .size ());
2235
2236
2236
2237
const std::string str_test = slot.generated_text .substr (pos);
2237
- bool send_text = true ;
2238
-
2239
- // Handle the start strings
2240
- if (start_string_missing)
2241
- {
2242
- send_text = false ;
2243
- }
2244
2238
2245
2239
// search stop word and delete it
2246
2240
size_t stop_pos = slot.find_stopping_strings (str_test, token_str.size (), true );
@@ -2249,33 +2243,44 @@ struct server_context {
2249
2243
slot.generated_text .begin () + pos + stop_pos,
2250
2244
slot.generated_text .end ());
2251
2245
pos = std::min (slot.n_sent_text , slot.generated_text .size ());
2246
+ full_stop_reached = true ;
2252
2247
} else if (slot.has_next_token ) {
2253
2248
stop_pos = slot.find_stopping_strings (str_test, token_str.size (), false );
2254
- send_text = send_text && stop_pos == std::string::npos;
2249
+ partial_stop_reached = ( stop_pos != std::string::npos) ;
2255
2250
}
2251
+ }
2256
2252
2257
- // check if there is any token to predict
2258
- if (send_text) {
2259
- // no send the stop word in the response
2260
- result.text_to_send = slot.generated_text .substr (pos, std::string::npos);
2261
- slot.n_sent_text += result.text_to_send .size ();
2262
- // add the token to slot queue and cache
2263
- } else {
2264
- result.text_to_send = " " ;
2265
- }
2253
+ if (full_stop_reached)
2254
+ {
2255
+ slot.stop = STOP_TYPE_WORD;
2256
+ slot.has_next_token = false ;
2257
+ SLT_DBG (slot, " stopped by word, n_decoded = %d, n_predict = %d\n " , slot.n_decoded , slot.params .n_predict );
2258
+ }
2266
2259
2267
- slot.add_token (result);
2268
- if (slot.params .stream ) {
2269
- send_partial_response (slot, result);
2270
- }
2260
+ if (partial_stop_reached || start_string_missing)
2261
+ {
2262
+ result.text_to_send = " " ;
2263
+ }
2264
+ else
2265
+ {
2266
+ size_t valid_generated_len = validate_utf8 (slot.generated_text );
2267
+ size_t available_data = valid_generated_len - slot.n_sent_text ;
2268
+ result.text_to_send = slot.generated_text .substr (slot.n_sent_text , available_data);
2269
+ slot.n_sent_text += result.text_to_send .size ();
2270
+ }
2271
+
2272
+ slot.add_token (result);
2273
+
2274
+ if (slot.params .stream && !result.text_to_send .empty ()) {
2275
+ send_partial_response (slot, result);
2271
2276
}
2272
2277
2273
2278
if (incomplete) {
2274
2279
slot.has_next_token = true ;
2275
2280
}
2276
2281
2277
2282
// check the limits
2278
- if (slot.n_decoded > 0 && slot. has_next_token && !slot. has_budget (params_base) ) {
2283
+ if (slot.has_next_token && token_budget_exhausted ) {
2279
2284
slot.stop = STOP_TYPE_LIMIT;
2280
2285
slot.has_next_token = false ;
2281
2286
0 commit comments