Commit fae32b5

add support for llama2 70b
1 parent 15062f1 commit fae32b5

3 files changed (+13 −0 lines changed)

README.md

Lines changed: 8 additions & 0 deletions
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```

+### Loading llama-2 70b
+
+Llama 2 70B requires the `n_gqa` parameter (grouped-query attention factor) to be set to 8 when loading:
+
+```python
+llm = Llama(model_path="./models/7B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server

 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.

llama_cpp/llama.py

Lines changed: 3 additions & 0 deletions
@@ -216,6 +216,7 @@ def __init__(
         embedding: bool = False,
         n_threads: Optional[int] = None,
         n_batch: int = 512,
+        n_gqa: Optional[int] = None,  # must be 8 for llama2 70b
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
@@ -260,6 +261,8 @@ def __init__(

         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
         self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
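
With this change, `n_gqa` is simply an optional keyword argument on the `Llama` constructor that is forwarded to the context params only when given. A minimal usage sketch (the model paths below are placeholders for wherever your converted GGML files live):

```python
from llama_cpp import Llama

# LLaMA 2 70B uses grouped-query attention, so the loader needs n_gqa=8.
# The paths are placeholders; point them at your own GGML model files.
llm_70b = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)

# Smaller models (7B/13B) simply omit n_gqa; the library's default
# context params are left untouched in that case.
llm_7b = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
```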

llama_cpp/llama_cpp.py

Lines changed: 2 additions & 0 deletions
@@ -162,6 +162,7 @@ class llama_token_data_array(Structure):
 # uint32_t seed; // RNG seed, -1 for random
 # int32_t n_ctx; // text context
 # int32_t n_batch; // prompt processing batch size
+# int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
 # const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
@@ -190,6 +191,7 @@ class llama_context_params(Structure):
        ("seed", c_uint32),
        ("n_ctx", c_int32),
        ("n_batch", c_int32),
+       ("n_gqa", c_int32),
        ("n_gpu_layers", c_int32),
        ("main_gpu", c_int32),
        ("tensor_split", POINTER(c_float)),
