Commit d49b387

[Inference] Added option to tokenize the prompt before sending it to the llama.cpp server in order to avoid double BOS at the beginning of the prompt (see discussions at ggml-org/llama.cpp#7107 (comment) and ggml-org/llama.cpp#7332)
1 parent 9258682 commit d49b387

File tree

1 file changed: +12 lines, -0 lines

thinkbench/inference.py

Lines changed: 12 additions & 0 deletions
@@ -420,9 +420,11 @@ class LlamaCppServerInferenceBackend(InferenceBackend):
     n_batch: int = 4096
     n_parallel: int = 1
     continuous_batching: bool = False
+    tokenize_before: bool = True
 
     process: Popen = None
     completion_url_template: Template = Template("http://localhost:${port}/completion")
+    tokenization_url_template: Template = Template("http://localhost:${port}/tokenize")
     properties_url_template: Template = Template("http://localhost:${port}/props")
     headers = {'content-type': 'application/json'}
@@ -582,6 +584,16 @@ def create_completion(self, prompt: str, completion_config: CompletionConfig, de
         if type(decoder) == GreedyConstrainedDecoder:
             completion_config.temperature = -1.0 # Return probs even when using greedy decoding
 
+        if self.tokenize_before:
+            prompt = self.session.post(
+                url=self.tokenization_url_template.substitute(port=self.port),
+                headers=self.headers,
+                json={
+                    "content": prompt,
+                    "add_special": False
+                }
+            ).json()["tokens"]
+
         request = {
             "prompt": prompt,
             "id_slot": additional_params["id_slot"], # ensure that a thread only uses its own server slot