From 21124db242be8b49ba1e4d28ebfaa4282e477d8a Mon Sep 17 00:00:00 2001
From: windspirit95
Date: Fri, 29 Mar 2024 15:04:52 +0900
Subject: [PATCH 1/5] Add logprobs return in ChatCompletionResponse

---
 llama_cpp/llama.py             | 4 ++++
 llama_cpp/llama_chat_format.py | 5 +++++
 llama_cpp/llama_types.py       | 1 +
 llama_cpp/server/types.py      | 9 +++++++++
 4 files changed, 19 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index fed84d579..e2d0644fe 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1641,6 +1641,9 @@ def create_chat_completion(
         handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
             self.chat_format
         )
+        _logprobs = top_logprobs
+        if not logprobs:
+            _logprobs = 0
         return handler(
             llama=self,
             messages=messages,
@@ -1653,6 +1656,7 @@ def create_chat_completion(
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
+            logprobs=_logprobs,
             stream=stream,
             stop=stop,
             seed=seed,
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index ccf4fd0b7..06cf9ced7 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -231,6 +231,7 @@ def _convert_text_completion_to_chat(
                     "role": "assistant",
                     "content": completion["choices"][0]["text"],
                 },
+                "logprobs": completion["choices"][0]["logprobs"],
                 "finish_reason": completion["choices"][0]["finish_reason"],
             }
         ],
@@ -254,6 +255,7 @@ def _convert_text_completion_chunks_to_chat(
                         "delta": {
                             "role": "assistant",
                         },
+                        "logprobs": None,
                         "finish_reason": None,
                     }
                 ],
@@ -273,6 +275,7 @@ def _convert_text_completion_chunks_to_chat(
                     if chunk["choices"][0]["finish_reason"] is None
                     else {}
                 ),
+                "logprobs": chunk["choices"][0]["logprobs"],
                 "finish_reason": chunk["choices"][0]["finish_reason"],
             }
         ],
@@ -487,6 +490,7 @@ def chat_completion_handler(
         temperature: float = 0.2,
         top_p: float = 0.95,
         top_k: int = 40,
+        logprobs: int = 0,
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
@@ -576,6 +580,7 @@ def chat_completion_handler(
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
+            logprobs=logprobs,
             stream=stream,
             stop=stop,
             seed=seed,
diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py
index 1b1befebe..87e000f14 100644
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@@ -84,6 +84,7 @@ class ChatCompletionFunction(TypedDict):
 class ChatCompletionResponseChoice(TypedDict):
     index: int
     message: "ChatCompletionResponseMessage"
+    logprobs: Optional[CompletionLogprobs]
     finish_reason: Optional[str]
 
 
diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index 378f8d74c..028aad185 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -209,6 +209,15 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
     )
+    logprobs: Optional[bool] = Field(
+        default=True,
+        description="Whether to output the logprobs or not. Default is True"
+    )
+    top_logprobs: Optional[int] = Field(
+        default=None,
+        ge=0,
+        description="The number of logprobs to generate. If None, no logprobs are generated. logprobs need to set to True.",
+    )
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field

From 9c3d35fce118a9f49fbbc1ba86ebcd0dc165b2e2 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sat, 30 Mar 2024 19:42:03 -0400
Subject: [PATCH 2/5] Fix duplicate field

---
 llama_cpp/server/types.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index 028aad185..eb246a01e 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -130,7 +130,6 @@ class CreateCompletionRequest(BaseModel):
     presence_penalty: Optional[float] = presence_penalty_field
     frequency_penalty: Optional[float] = frequency_penalty_field
     logit_bias: Optional[Dict[str, float]] = Field(None)
-    logprobs: Optional[int] = Field(None)
     seed: Optional[int] = Field(None)
 
     # ignored or currently unsupported

From f884796b3340410d142bce6f6f00c859e9eb6234 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sat, 30 Mar 2024 19:42:24 -0400
Subject: [PATCH 3/5] Set default to false

---
 llama_cpp/server/types.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index eb246a01e..ce9c87a69 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -209,7 +209,7 @@ class CreateChatCompletionRequest(BaseModel):
         description="The maximum number of tokens to generate. Defaults to inf",
     )
     logprobs: Optional[bool] = Field(
-        default=True,
+        default=False,
         description="Whether to output the logprobs or not. Default is True"
     )
     top_logprobs: Optional[int] = Field(

From f3751816fe42b1431021452498728d10e860b649 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sat, 30 Mar 2024 19:45:39 -0400
Subject: [PATCH 4/5] Simplify check

---
 llama_cpp/llama.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index e2d0644fe..66caaa958 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1641,9 +1641,6 @@ def create_chat_completion(
         handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
             self.chat_format
         )
-        _logprobs = top_logprobs
-        if not logprobs:
-            _logprobs = 0
         return handler(
             llama=self,
             messages=messages,
@@ -1656,7 +1653,7 @@ def create_chat_completion(
             top_k=top_k,
             min_p=min_p,
             typical_p=typical_p,
-            logprobs=_logprobs,
+            logprobs=top_logprobs if logprobs else None,
             stream=stream,
             stop=stop,
             seed=seed,

From c53690353c97ceb2ce44f259d1d85d9a6e28e3b5 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 31 Mar 2024 13:27:53 -0400
Subject: [PATCH 5/5] Add server example

---
 llama_cpp/server/app.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index d5e2f7ee8..815ed3c5e 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -405,6 +405,18 @@ async def create_chat_completion(
                     }
                 },
             },
+            "logprobs": {
+                "summary": "Logprobs",
+                "value": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": "What is the capital of France?"},
+                    ],
+                    "logprobs": True,
+                    "top_logprobs": 10
+                },
+            },
         }
     ),
     llama_proxy: LlamaProxy = Depends(get_llama_proxy),
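
A minimal client-side sketch (not part of the patches above) to exercise the new fields end to end. It assumes a llama-cpp-python server is already running locally on the default port 8000 (for example via `python -m llama_cpp.server --model <path-to-gguf>`); the model name, prompt, and `top_logprobs` value simply mirror the openapi example added in PATCH 5/5.

```python
import requests

# Request logprobs from the OpenAI-compatible chat endpoint using the new
# `logprobs` / `top_logprobs` request fields added by this patch series.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
        ],
        "logprobs": True,    # defaults to False after PATCH 3/5
        "top_logprobs": 10,  # number of alternatives to return per token
    },
    timeout=120,
)
resp.raise_for_status()

choice = resp.json()["choices"][0]
print(choice["message"]["content"])
# After PATCH 1/5 each chat choice carries a "logprobs" entry next to "message".
print(choice["logprobs"])
```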