Commit 70005bd

tests : use Python to generate tokenizer tests for C++

1 parent dfa058e

2 files changed (+46, -35 lines)

tests/test-tokenizer-0.cpp

Lines changed: 34 additions & 35 deletions
@@ -6,42 +6,34 @@
 #include <map>
 #include <vector>
 
+// generate using test-tokenizer-0.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { "" , { }, },
-        { " ", { 259, }, },
-        { "  ", { 1678, }, },
-        { "   ", { 268, }, },
-        { "\t", { 29871, 12, }, },
-        { "\n", { 29871, 13, }, },
-        { "\t\n", { 29871, 12, 13, }, },
-        { "Hello world", { 15043, 3186, }, },
-        { " Hello world", { 29871, 15043, 3186, }, },
-        { "Hello World", { 15043, 2787, }, },
-        { " Hello World", { 29871, 15043, 2787, }, },
-        { " Hello World!", { 29871, 15043, 2787, 29991, }, },
-        { "Hello, world!", { 15043, 29892, 3186, 29991, }, },
-        { " Hello, world!", { 29871, 15043, 29892, 3186, 29991, }, },
-        { " this is 🦙.cpp", { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
-        { "w048 7tuijk dsdfhu", { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
-        { "нещо на Български", { 1538, 4851, 665, 1386, 29713, 1305, }, },
-        { "កាន់តែពិសេសអាចខលចេញ",
-          { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
-            146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
-            31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
-            161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
-            136, 228, 162, 132, 228, 161, 140, }, },
-        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-          { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
-            243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
-            313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
-            313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
-        { "Hello", { 15043, }, },
-        { " Hello", { 29871, 15043, }, },
-        { "  Hello", { 259, 15043, }, },
-        { "   Hello", { 1678, 15043, }, },
-        { "    Hello", { 268, 15043, }, },
-        { "    Hello\n    Hello", { 268, 15043, 13, 1678, 15043, }, },
+        { ""                      , { }, },
+        { " "                     , { 259, }, },
+        { "  "                    , { 1678, }, },
+        { "   "                   , { 268, }, },
+        { "\t"                    , { 29871, 12, }, },
+        { "\n"                    , { 29871, 13, }, },
+        { "\t\n"                  , { 29871, 12, 13, }, },
+        { "Hello world"           , { 15043, 3186, }, },
+        { " Hello world"          , { 29871, 15043, 3186, }, },
+        { "Hello World"           , { 15043, 2787, }, },
+        { " Hello World"          , { 29871, 15043, 2787, }, },
+        { " Hello World!"         , { 29871, 15043, 2787, 29991, }, },
+        { "Hello, world!"         , { 15043, 29892, 3186, 29991, }, },
+        { " Hello, world!"        , { 29871, 15043, 29892, 3186, 29991, }, },
+        { " this is 🦙.cpp"        , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
+        { "w048 7tuijk dsdfhu"    , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
+        { "нещо на Български"     , { 1538, 4851, 665, 1386, 29713, 1305, }, },
+        { "កាន់តែពិសេសអាចខលចេញ"       , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, },
+        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
+        { "Hello"                 , { 15043, }, },
+        { " Hello"                , { 29871, 15043, }, },
+        { "  Hello"               , { 259, 15043, }, },
+        { "   Hello"              , { 1678, 15043, }, },
+        { "    Hello"             , { 268, 15043, }, },
+        { "    Hello\n    Hello"  , { 268, 15043, 13, 1678, 15043, }, },
     };
 
     return _k_tests;
@@ -99,7 +91,14 @@ int main(int argc, char **argv) {
         const std::vector<llama_token> res_bos   = llama_tokenize(ctx, test_kv.first, true);
         const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
 
-        fprintf(stderr, "%s : '%s' tokenized to '%s'\n", __func__, test_kv.first.c_str(), llama_detokenize(ctx, res_bos).c_str());
+        printf("\n");
+        printf("src: '%s'\n", test_kv.first.c_str());
+        printf("res: '%s'\n", llama_detokenize(ctx, res_bos).c_str());
+        printf("tok: ");
+        for (const auto & tok : res_bos) {
+            printf("%d ", tok);
+        }
+        printf("\n");
 
         bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1;
 

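The rewritten check keeps the convention that the expected token lists in k_tests() are stored without BOS: tokenizing with add_bos must then yield the same sequence with one extra leading token, id 1 (LLaMA's BOS). A minimal Python sketch of the size-and-BOS part of that `correct` check, using the "Hello world" entry from the table above (the element-wise comparison of the remaining ids is presumably done later in the file, outside this hunk):

# Sketch of the size/BOS invariant the C++ `correct` check encodes.
# The id values come straight from the k_tests() table above.
expected  = [15043, 3186]       # k_tests() entry for "Hello world", stored without BOS
res_nobos = [15043, 3186]       # llama_tokenize(ctx, text, false)
res_bos   = [1, 15043, 3186]    # llama_tokenize(ctx, text, true): BOS (id 1) prepended

correct = (len(res_nobos) == len(expected) and
           len(res_bos) == len(res_nobos) + 1 and
           res_bos[0] == 1)
assert correct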
tests/test-tokenizer-0.py

Lines changed: 12 additions & 0 deletions
@@ -56,3 +56,15 @@
 print("'" + tokenizer.decode([15043, 15043]) + "'")               # 'Hello Hello'
 print("'" + tokenizer.decode([29871, 15043]) + "'")               # ' Hello'
 print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
+
+print("\n\ntests for C++:\n")
+for text in tests:
+    res = tokenizer.encode(text, add_bos=False)
+
+    k = text.replace('\n', '\\n')
+    k = k.replace('\t', '\\t')
+    k = '"' + k + '"'
+    print("{ %-24s, { " % k, end='')
+    for x in res:
+        print("%7d," % x, end='')
+    print(" }, },")
