Commit d49b387

[Inference] Added option to tokenize the prompt before sending it to the llama.cpp server in order to avoid double BOS at the beginning of the prompt (see discussions at ggml-org/llama.cpp#7107 (comment) and ggml-org/llama.cpp#7332)
1 parent 9258682 commit d49b387

File tree

1 file changed: +12 lines, -0 lines

thinkbench/inference.py

Lines changed: 12 additions & 0 deletions
@@ -420,9 +420,11 @@ class LlamaCppServerInferenceBackend(InferenceBackend):
     n_batch: int = 4096
     n_parallel: int = 1
     continuous_batching: bool = False
+    tokenize_before: bool = True
 
     process: Popen = None
     completion_url_template: Template = Template("http://localhost:${port}/completion")
+    tokenization_url_template: Template = Template("http://localhost:${port}/tokenize")
     properties_url_template: Template = Template("http://localhost:${port}/props")
     headers = {'content-type': 'application/json'}
@@ -582,6 +584,16 @@ def create_completion(self, prompt: str, completion_config: CompletionConfig, de
         if type(decoder) == GreedyConstrainedDecoder:
             completion_config.temperature = -1.0 # Return probs even when using greedy decoding
 
+        if self.tokenize_before:
+            prompt = self.session.post(
+                url=self.tokenization_url_template.substitute(port=self.port),
+                headers=self.headers,
+                json={
+                    "content": prompt,
+                    "add_special": False
+                }
+            ).json()["tokens"]
+
         request = {
             "prompt": prompt,
             "id_slot": additional_params["id_slot"], # ensure that a thread only uses its own server slot