Commit fae32b5

add support for llama2 70b
1 parent 15062f1 commit fae32b5

3 files changed (+13 −0 lines changed)

README.md

Lines changed: 8 additions & 0 deletions
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```

+### Loading llama-2 70b
+
+Llama 2 70B requires the `n_gqa` parameter (grouped-query attention factor) to be set to 8 when loading:
+
+```python
+llm = Llama(model_path="./models/7B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server

 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.

llama_cpp/llama.py

Lines changed: 3 additions & 0 deletions
@@ -216,6 +216,7 @@ def __init__(
         embedding: bool = False,
         n_threads: Optional[int] = None,
         n_batch: int = 512,
+        n_gqa: Optional[int] = None,  # must be 8 for llama2 70b
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
@@ -260,6 +261,8 @@ def __init__(

         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
         self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
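
With this change, `n_gqa` is simply an optional keyword argument on the `Llama` constructor that is forwarded to the context params only when given. A minimal usage sketch (the model paths below are placeholders for wherever your converted GGML files live):

```python
from llama_cpp import Llama

# LLaMA 2 70B uses grouped-query attention, so the loader needs n_gqa=8.
# The paths are placeholders; point them at your own GGML model files.
llm_70b = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)

# Smaller models (7B/13B) simply omit n_gqa; the library's default
# context params are left untouched in that case.
llm_7b = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
```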

llama_cpp/llama_cpp.py

Lines changed: 2 additions & 0 deletions
@@ -162,6 +162,7 @@ class llama_token_data_array(Structure):
 # uint32_t seed; // RNG seed, -1 for random
 # int32_t n_ctx; // text context
 # int32_t n_batch; // prompt processing batch size
+# int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
 # const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
@@ -190,6 +191,7 @@ class llama_context_params(Structure):
        ("seed", c_uint32),
        ("n_ctx", c_int32),
        ("n_batch", c_int32),
+       ("n_gqa", c_int32),
        ("n_gpu_layers", c_int32),
        ("main_gpu", c_int32),
        ("tensor_split", POINTER(c_float)),
