Skip to content

Commit 229ffff

Browse files
jklj077 and ggerganov
authored
llama : add BPE pre-tokenization for Qwen2 (#7114)
* Add BPE pre-tokenization for Qwen2. * minor : fixes --------- Co-authored-by: Ren Xuancheng <17811943+jklj077@users.noreply.github.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 1fd9c17 commit 229ffff

File tree

8 files changed

+167
-2
lines changed

8 files changed

+167
-2
lines changed

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class TOKENIZER_TYPE(IntEnum):
6767
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
6868
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
6969
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
70+
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
7071
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
7172
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
7273
]

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
314314
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
315315
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
316316
res = "command-r"
317+
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
318+
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
319+
res = "qwen2"
317320
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
318321
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
319322
res = "olmo"

llama.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4391,6 +4391,9 @@ static void llm_load_vocab(
43914391
} else if (
43924392
tokenizer_pre == "command-r") {
43934393
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
4394+
} else if (
4395+
tokenizer_pre == "qwen2") {
4396+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
43944397
} else if (
43954398
tokenizer_pre == "olmo") {
43964399
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -12263,6 +12266,13 @@ struct llm_tokenizer_bpe {
1226312266
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
1226412267
});
1226512268
break;
12269+
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
12270+
word_collection = unicode_regex_split(text, {
12271+
// original regex from tokenizer.json
12272+
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
12273+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12274+
});
12275+
break;
1226612276
default:
1226712277
// default regex for BPE tokenization pre-processing
1226812278
word_collection = unicode_regex_split(text, {

llama.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,9 @@ extern "C" {
8181
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
8282
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
8383
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
84-
LLAMA_VOCAB_PRE_TYPE_OLMO = 10,
85-
LLAMA_VOCAB_PRE_TYPE_DBRX = 11,
84+
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
85+
LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
86+
LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
8687
};
8788

8889
// note: these values should be synchronized with ggml_rope

models/ggml-vocab-qwen2.gguf

5.65 MB
Binary file not shown.

models/ggml-vocab-qwen2.gguf.inp

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
3
77+
__ggml_vocab_test__
78+
33
79+
__ggml_vocab_test__
80+
333
81+
__ggml_vocab_test__
82+
3333
83+
__ggml_vocab_test__
84+
33333
85+
__ggml_vocab_test__
86+
333333
87+
__ggml_vocab_test__
88+
3333333
89+
__ggml_vocab_test__
90+
33333333
91+
__ggml_vocab_test__
92+
333333333
93+
__ggml_vocab_test__
94+
95+
96+
97+
98+
99+
100+
101+
102+
103+
104+
105+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
106+
__ggml_vocab_test__

models/ggml-vocab-qwen2.gguf.out

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
1122 220 19 220 26062 3951
2+
37 50753 261
3+
4+
220
5+
256
6+
262
7+
197
8+
198
9+
271
10+
1406
11+
1572
12+
9707 1879
13+
21927 1879
14+
9707 4337
15+
21927 4337
16+
21927 4337 0
17+
9707 11 1879 0
18+
21927 11 1879 0
19+
419 374 11162 99 247 13 10821
20+
86 15 19 23 220 22 83 1963 41808 11472 2940 16739
21+
78762 14144 1456 13073 63471 33594 3038 133178 79012
22+
146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
23+
145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
24+
9707
25+
21927
26+
220 21927
27+
256 21927
28+
262 21927
29+
262 21927 198 262 21927
30+
320
31+
198 284
32+
6 11385
33+
9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
34+
18
35+
18 18
36+
18 18 18
37+
18 18 18 18
38+
18 18 18 18 18
39+
18 18 18 18 18 18
40+
18 18 18 18 18 18 18
41+
18 18 18 18 18 18 18 18
42+
18 18 18 18 18 18 18 18 18
43+
198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE
8484
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
8585
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
8686
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
87+
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
8788

8889
# build test-tokenizer-1-bpe target once and add many tests
8990
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)

0 commit comments

Comments
 (0)