
Commit 68c5ac6

Merge branch 'master' into compilade/lazy-convert-hf
2 parents: 62303e7 + bcdee0d


54 files changed: +1151 -771 lines

.flake8

Lines changed: 14 additions & 1 deletion

@@ -1,4 +1,17 @@
 [flake8]
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
-exclude = examples/*,examples/*/**,*/**/__init__.py
+exclude =
+    # Do not traverse examples
+    examples,
+    # Do not include package initializers
+    __init__.py,
+    # No need to traverse our git directory
+    .git,
+    # There's no value in checking cache directories
+    __pycache__,
+    # No need to include the build path
+    build,
+    # This contains builds that we don't want to check
+    dist # This is generated with `python build .` for package releases
+# max-complexity = 10
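
As a quick sanity check of the new exclude list, running the linter from the repository root should now skip `examples`, `__init__.py`, `.git`, `__pycache__`, `build`, and `dist` automatically. This is a hedged sketch, not part of the commit, assuming flake8 is installed locally:

```sh
# run from the repository root so the .flake8 config above is picked up
pip install flake8
flake8 .
```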

.github/workflows/bench.yml

Lines changed: 13 additions & 1 deletion

@@ -52,7 +52,19 @@ jobs:
       ftype: q4_0
       pr_comment_enabled: "true"

-    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
+    if: |
+      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
+      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )

     steps:
       - name: Clone
         id: checkout
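
For reference, the first branch of the new condition can be exercised with a manual dispatch that sets the `gpu-series` input. This is a hedged example using the GitHub CLI; the workflow file name and input name come from this diff, the rest is an assumption:

```sh
# hypothetical manual trigger of the benchmark workflow with the T4 GPU series input
gh workflow run bench.yml -f gpu-series=Standard_NC4as_T4_v3
```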

Makefile

Lines changed: 1 addition & 2 deletions

@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
 		./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
-		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
 		./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+		./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
 	elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 		continue; \
 	elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \

README.md

Lines changed: 10 additions & 36 deletions

@@ -712,6 +712,8 @@ Building the program with BLAS support may lead to some performance improvements

 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

+Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+
 ```bash
 # obtain the official LLaMA model weights and place them in ./models
 ls ./models
@@ -977,48 +979,20 @@ Here is a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

-#### Building the Project using Termux (F-Droid)
-Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
-
-Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
-
-If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
-```
-apt install libopenblas
-```
-
-Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
-```
-apt install ocl-icd opencl-headers opencl-clhpp clinfo
-```
-
-In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
+#### Build on Android using Termux
+[Termux](https://github.com/termux/termux-app#installation) is an alternative to execute `llama.cpp` on an Android device (no root required).
 ```
-cmake .
-make
-cp libclblast.so* $PREFIX/lib
-cp ./include/clblast.h ../llama.cpp
+apt update && apt upgrade -y
+apt install git
 ```

-Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
+It's recommended to move your model inside the `~/` directory for best performance:
 ```
-cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
-cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
-make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
-```
-
-Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
+cd storage/downloads
+mv model.gguf ~/
 ```
-GGML_OPENCL_PLATFORM=0
-GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
-```
-
-(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
-
-For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

-Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
+[Follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.

 ### Docker

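
The new README note above points LLaMA 3 checkpoints at `convert-hf-to-gguf.py` instead of `convert.py`. A hedged example invocation; the model directory and output file names are placeholders:

```sh
# hypothetical paths; point the script at a LLaMA 3 checkout downloaded from Hugging Face
python3 convert-hf-to-gguf.py models/Meta-Llama-3-8B-Instruct/ --outfile models/meta-llama-3-8b-instruct.gguf
```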

common/common.cpp

Lines changed: 1 addition & 1 deletion

@@ -76,7 +76,7 @@ int32_t get_num_physical_cores() {
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
-        std::ifstream thread_siblings("/sys/devices/system/cpu"
+        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
             + std::to_string(cpu) + "/topology/thread_siblings");
         if (!thread_siblings.is_open()) {
             break; // no more cpus
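
The fix changes the sysfs path from `/sys/devices/system/cpu<N>/...` to `/sys/devices/system/cpu/cpu<N>/...`, so `get_num_physical_cores()` can actually open the per-CPU topology files again. A quick check on a Linux machine (not part of the commit):

```sh
# the corrected per-CPU topology path that the loop above reads
cat /sys/devices/system/cpu/cpu0/topology/thread_siblings
```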

convert-hf-to-gguf-update.py

File mode changed: 100644 → 100755 (now executable)
Lines changed: 32 additions & 17 deletions

@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # This script downloads the tokenizer models of the specified models from Huggingface and
 # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
 #
@@ -31,6 +33,7 @@
 from enum import IntEnum, auto
 from transformers import AutoTokenizer

+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")


@@ -62,6 +65,8 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
+    {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
@@ -102,6 +107,14 @@ def download_file_with_auth(url, token, save_path):
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)

+    # if downloaded file is less than 1KB, we likely need to download an LFS instead
+    if os.path.getsize(save_path) < 1024:
+        # remove the file
+        os.remove(save_path)
+        url = f"{repo}/resolve/main/tokenizer.json"
+        save_path = f"models/tokenizers/{name}/tokenizer.json"
+        download_file_with_auth(url, token, save_path)
+
     if tokt == TOKENIZER_TYPE.SPM:
         url = f"{repo}/resolve/main/tokenizer.model"
         save_path = f"models/tokenizers/{name}/tokenizer.model"
@@ -158,8 +171,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         chktok = tokenizer.encode(chktxt)
         chkhsh = sha256(str(chktok).encode()).hexdigest()

-        print(f"chktok: {{chktok}}")
-        print(f"chkhsh: {{chkhsh}}")
+        logger.debug(f"chktok: {{chktok}}")
+        logger.debug(f"chkhsh: {{chkhsh}}")

         res = None

@@ -168,22 +181,22 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         # don't edit the hashes manually!
 {src_ifs}
         if res is None:
-            print("\\n")
-            print("**************************************************************************************")
-            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("** There are 2 possible reasons for this:")
-            print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
-            print("** - the pre-tokenization config has changed upstream")
-            print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-            print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
-            print("**")
-            print(f"** chkhsh: {{chkhsh}}")
-            print("**************************************************************************************")
-            print("\\n")
+            logger.warning("\\n")
+            logger.warning("**************************************************************************************")
+            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+            logger.warning("** There are 2 possible reasons for this:")
+            logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+            logger.warning("** - the pre-tokenization config has changed upstream")
+            logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("**")
+            logger.warning(f"** chkhsh: {{chkhsh}}")
+            logger.warning("**************************************************************************************")
+            logger.warning("\\n")
             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

-        print(f"tokenizer.ggml.pre: {{repr(res)}}")
-        print(f"chkhsh: {{chkhsh}}")
+        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
+        logger.debug(f"chkhsh: {{chkhsh}}")

         return res
 """
@@ -197,6 +210,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 # generate tests for each tokenizer model

 tests = [
+    "ied 4 ½ months",
+    "Führer",
     "",
     " ",
     "  ",
@@ -281,6 +296,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 for model in models:
     name = model["name"]

-    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100

 logger.info("\n")

convert-hf-to-gguf.py

Lines changed: 7 additions & 1 deletion

@@ -391,6 +391,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"
+        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
+            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
+            res = "command-r"

         if res is None:
             logger.warning("\n")
@@ -407,7 +413,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             logger.warning("\n")
             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

-        logger.debug(f"tokenizer.ggml.pre: {res}")
+        logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
         logger.debug(f"chkhsh: {chkhsh}")

         return res
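
With the two new `chkhsh` entries, tokenizers from the referenced repos resolve to the `refact` and `command-r` pre-tokenizer names. The corresponding vocab-only conversion commands, as printed by `convert-hf-to-gguf-update.py` above (paths follow that script's `models/tokenizers/` layout):

```sh
python3 convert-hf-to-gguf.py models/tokenizers/refact/ --outfile models/ggml-vocab-refact.gguf --vocab-only
python3 convert-hf-to-gguf.py models/tokenizers/command-r/ --outfile models/ggml-vocab-command-r.gguf --vocab-only
```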

convert-lora-to-ggml.py

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("lora-to-gguf")

 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}

examples/gguf-split/gguf-split.cpp

Lines changed: 15 additions & 4 deletions

@@ -32,6 +32,7 @@ struct split_params {
     int n_split_tensors = 128;
     std::string input;
     std::string output;
+    bool no_tensor_first_split = false;
     bool dry_run = false;
 };

@@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
     printf(" --merge merge multiple GGUF to a single GGUF\n");
     printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
     printf(" --split-max-size N(M|G) max size per split\n");
+    printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
     printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
     printf("\n");
 }
@@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
            arg_found = true;
            params.dry_run = true;
        }
+       if (arg == "--no-tensor-first-split") {
+           arg_found = true;
+           params.no_tensor_first_split = true;
+       }

        if (is_op_set) {
            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@@ -200,10 +206,10 @@ struct split_strategy {
        // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
        int i_split = -1;
        struct gguf_context * ctx_out = NULL;
-       auto new_ctx_out = [&]() {
+       auto new_ctx_out = [&](bool allow_no_tensors) {
            i_split++;
            if (ctx_out != NULL) {
-               if (gguf_get_n_tensors(ctx_out) == 0) {
+               if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
                    fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
                    exit(EXIT_FAILURE);
                }
@@ -220,7 +226,12 @@ struct split_strategy {
        };

        // initialize ctx_out for the first split
-       new_ctx_out();
+       new_ctx_out(false);
+
+       // skip first split if no_tensor_first_split is set
+       if (params.no_tensor_first_split) {
+           new_ctx_out(true);
+       }

        // process tensors one by one
        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@@ -230,7 +241,7 @@ struct split_strategy {
            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
            size_t next_tensors_size = curr_tensors_size + n_bytes;
            if (should_split(i, next_tensors_size)) {
-               new_ctx_out();
+               new_ctx_out(false);
                curr_tensors_size = n_bytes;
            } else {
                curr_tensors_size = next_tensors_size;
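
The new `--no-tensor-first-split` flag produces a first shard that carries only metadata, with all tensors pushed into later shards. A hedged command-line sketch mirroring the updated test script below; the binary path and file names are assumptions:

```sh
# hypothetical paths; shards of at most 32 tensors, with no tensors in the first file
./gguf-split --split-max-tensors 32 --no-tensor-first-split ggml-model.gguf ggml-model-split
```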

examples/gguf-split/tests.sh

Lines changed: 7 additions & 7 deletions

@@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
 echo PASS
 echo

-# 4. Split with no tensor in metadata
-#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
-#echo PASS
-#echo
+# 4. Split with no tensors in the first split
+$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
+echo PASS
+echo

 # 4b. Test the sharded model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
-#echo PASS
-#echo
+$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
+echo PASS
+echo

 # 5. Merge
 #$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf

examples/llama-bench/llama-bench.cpp

Lines changed: 15 additions & 0 deletions

@@ -178,6 +178,7 @@ struct cmd_params {
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
+    ggml_numa_strategy numa;
     int reps;
     bool verbose;
     output_formats output_format;
@@ -200,6 +201,7 @@ static const cmd_params cmd_params_defaults = {
     /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap */ {true},
     /* embeddings */ {false},
+    /* numa */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps */ 5,
     /* verbose */ false,
     /* output_format */ MARKDOWN
@@ -224,6 +226,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
     printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
     printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -396,6 +399,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+        } else if (arg == "--numa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            } else {
+                std::string value(argv[i]);
+                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+                else { invalid_param = true; break; }
+            }
         } else if (arg == "-fa" || arg == "--flash-attn") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1215,6 +1229,7 @@ int main(int argc, char ** argv) {
         llama_log_set(llama_null_log_callback, NULL);
     }
     llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize printer
     std::unique_ptr<printer> p;
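
A hedged example of the new `--numa` option, which accepts `distribute`, `isolate`, or `numactl` and is forwarded to `llama_numa_init()` before the benchmark runs; the model path is a placeholder:

```sh
# hypothetical model path; benchmark with the 'distribute' NUMA strategy
./llama-bench -m models/ggml-model-q4_0.gguf --numa distribute
```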
