 # TODO: automate the update of convert-hf-to-gguf.py
 #
 
+import json
+import logging
 import os
-import requests
 import sys
-import json
-
-from hashlib import sha256
-
 from enum import IntEnum, auto
+from hashlib import sha256
+
+import requests
+from transformers import AutoTokenizer
+
+logger = logging.getLogger("convert-hf-to-gguf-update")
+
 
 
 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
     BPE = auto()
     WPM = auto()
 
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
 else:
-    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
     sys.exit(1)
 
 # TODO: add models here, base models preferred
 models = [
-    { "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    { "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    { "name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    { "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    { "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    { "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    { "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    { "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    { "name": "phi", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-1", },
-    { "name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
-    { "name": "qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-tokenizer", },
-    { "name": "mistral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
-    { "name": "mistral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
-    { "name": "mixtral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
-    { "name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
+    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "phi", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-1", },
+    {"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
+    {"name": "qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-tokenizer", },
+    {"name": "mistral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
+    {"name": "mistral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
+    {"name": "mixtral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
+    {"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
+    {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
+    {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
 ]
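For orientation only (this block is not part of the diff): registering another tokenizer in this script amounts to adding one more entry to `models`. The `name` is what the generated code later returns as the `tokenizer.ggml.pre` identifier, `tokt` selects the tokenizer family, and `repo` is the Hugging Face repository the files are pulled from. A minimal, purely hypothetical sketch:

```python
# Hypothetical entry -- "my-new-model" and its repo URL are placeholders, not a real addition.
# Equivalent to appending one more line inside the `models` list above.
models.append(
    {"name": "my-new-model", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/my-org/my-new-model", }
)
```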
 # make directory "models/tokenizers" if it doesn't exist
 if not os.path.exists("models/tokenizers"):
     os.makedirs("models/tokenizers")
 
+
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
     response = requests.get(url, headers=headers)
     if response.status_code == 200:
         with open(save_path, 'wb') as f:
             f.write(response.content)
-        print(f"File {save_path} downloaded successfully")
+        logger.info(f"File {save_path} downloaded successfully")
     else:
-        print(f"Failed to download file. Status code: {response.status_code}")
+        logger.info(f"Failed to download file. Status code: {response.status_code}")
+
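For orientation only, and not part of the change itself: the helper above is invoked further down in the script once per tokenizer file, along these lines (the repo, model name, and file are just examples):

```python
# Illustrative call pattern; "llama-bpe" and tokenizer.json stand in for whichever
# model and file the download loop below is currently fetching.
repo = "https://huggingface.co/meta-llama/Meta-Llama-3-8B"
url = f"{repo}/raw/main/tokenizer.json"
save_path = "models/tokenizers/llama-bpe/tokenizer.json"
download_file_with_auth(url, token, save_path)
```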
 
 # download the tokenizer models
 for model in models:
@@ -88,10 +98,10 @@ def download_file_with_auth(url, token, save_path):
     if not os.path.exists(f"models/tokenizers/{name}"):
         os.makedirs(f"models/tokenizers/{name}")
     else:
-        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
         continue
 
-    print(f"Downloading {name} to models/tokenizers/{name}")
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")
 
     url = f"{repo}/raw/main/config.json"
     save_path = f"models/tokenizers/{name}/config.json"
@@ -122,76 +132,76 @@ def download_file_with_auth(url, token, save_path):
         continue
 
     # create the tokenizer
-    from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
 
-    print(f"model: {name}")
-    print(f"tokt: {tokt}")
-    print(f"repo: {model['repo']}")
-    print(f"chktok: {chktok}")
-    print(f"chkhsh: {chkhsh}")
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")
 
     # print the "pre_tokenizer" content from the tokenizer.json
     with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
         cfg = json.load(f)
         pre_tokenizer = cfg["pre_tokenizer"]
-        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
 
-    print(f"\n")
+    logger.info("")
 
     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
     src_ifs += f"            res = \"{name}\"\n"
 
-    src_func = ""
-    src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
-    src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
-    src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
-    src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
-    src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
-    src_func += "\n"
-    src_func += f"        chktxt = {repr(chktxt)}\n"
-    src_func += "\n"
-    src_func += "        chktok = tokenizer.encode(chktxt)\n"
-    src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
-    src_func += "\n"
-    src_func += "        print(f\"chktok: {chktok}\")\n"
-    src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-    src_func += "\n"
-    src_func += "        res = None\n"
-    src_func += "\n"
-    src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
-    src_func += "        # or pull the latest version of the model from Huggingface\n"
-    src_func += "        # don't edit the hashes manually!\n"
-    src_func += f"{src_ifs}\n"
-    src_func += "        if res is None:\n"
-    src_func += "            print(\"\\n\")\n"
-    src_func += "            print(\"**************************************************************************************\")\n"
-    src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-    src_func += "            print(\"** There are 2 possible reasons for this:\")\n"
-    src_func += "            print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
-    src_func += "            print(\"** - the pre-tokenization config has changed upstream\")\n"
-    src_func += "            print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
-    src_func += "            print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
-    src_func += "            print(\"**\")\n"
-    src_func += "            print(f\"** chkhsh: {chkhsh}\")\n"
-    src_func += "            print(\"**************************************************************************************\")\n"
-    src_func += "            print(\"\\n\")\n"
-    src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
-    src_func += "\n"
-    src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
-    src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-    src_func += "\n"
-    src_func += "        return res\n"
-
-    print(src_func)
-
-    print("\n")
-    print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-    print("\n")
+    src_func = f"""
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific for the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer
+
+        chktxt = {repr(chktxt)}
+
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        print(f"chktok: {{chktok}}")
+        print(f"chkhsh: {{chkhsh}}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        # or pull the latest version of the model from Huggingface
+        # don't edit the hashes manually!
+{src_ifs}
+        if res is None:
+            print("\\n")
+            print("**************************************************************************************")
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
+            print("** There are 2 possible reasons for this:")
+            print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+            print("** - the pre-tokenization config has changed upstream")
+            print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+            print("**")
+            print(f"** chkhsh: {{chkhsh}}")
+            print("**************************************************************************************")
+            print("\\n")
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        print(f"tokenizer.ggml.pre: {{repr(res)}}")
+        print(f"chkhsh: {{chkhsh}}")
+
+        return res
+    """
+
+    print(src_func)  # noqa: NP100
+
+    logger.info("\n")
+    logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+    logger.info("\n")
 
 # generate tests for each tokenizer model
@@ -257,7 +267,6 @@ def download_file_with_auth(url, token, save_path):
     tokt = model["tokt"]
 
     # create the tokenizer
-    from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@@ -272,23 +281,22 @@ def download_file_with_auth(url, token, save_path):
             f.write(f" {r}")
         f.write("\n")
 
-    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
 
 # generate commands for creating vocab files
-print()  # pad output
-
 shscript = "#!/usr/bin/env bash\n\n"
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")
+
 for model in models:
     name = model["name"]
     tmpline = f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
     shscript += tmpline
-    print(tmpline, end="")  # remove dupped lines
+    logger.info(tmpline.strip())
 
 with open("generate-vocab.sh", "w", encoding="utf-8") as f:
     f.writelines(shscript)
-    print(f"Wrote {len(shscript)} bytes to generate-vocab.sh")
+    logger.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")
 
-print()  # pad output
-print("Run the following commands to generate the vocab files for testing:")
-print("Enable execution: chmod +x generate-vocab.sh")
-print("Execute with ./generate-vocab.sh")
+logger.info("Enable execution: chmod +x generate-vocab.sh")
+logger.info("Execute with ./generate-vocab.sh")