# TODO: automate the update of convert-hf-to-gguf.py
#

+import logging
import os
import requests
import sys
import json

from hashlib import sha256
from enum import IntEnum, auto
+from transformers import AutoTokenizer
+
+logger = logging.getLogger("convert-hf-to-gguf-update")
+
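
The diff introduces a module-level logger but leaves handler setup to the caller. A minimal sketch of wiring it up, assuming plain stderr output (the --verbose check is illustrative, not part of this script):

# sketch only: configure a handler so the logger.info() calls below are visible
import logging
import sys

logging.basicConfig(
    stream=sys.stderr,
    level=logging.DEBUG if "--verbose" in sys.argv else logging.INFO,
    format="%(name)s: %(message)s",
)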

class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()

+
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
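
The intent of chktxt: two tokenizers with different pre-tokenizers will almost certainly emit different token ids for this string, so hashing the ids gives a cheap, stable fingerprint. A minimal sketch of the computation, reusing the script's own recipe (the gpt-2 path is just one of the directories populated below):

# sketch: fingerprint a tokenizer by hashing the ids it produces for chktxt
tok = AutoTokenizer.from_pretrained("models/tokenizers/gpt-2")
ids = tok.encode(chktxt)
fingerprint = sha256(str(ids).encode()).hexdigest()
# same vocab + same pre-tokenizer => same fingerprint across runs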

if len(sys.argv) == 2:
    token = sys.argv[1]
else:
-    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

# TODO: add models here, base models preferred
models = [
-    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-]
+    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+]

# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
    os.makedirs("models/tokenizers")

+
def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
-        print(f"File {save_path} downloaded successfully")
+        logger.info(f"File {save_path} downloaded successfully")
    else:
-        print(f"Failed to download file. Status code: {response.status_code}")
+        logger.info(f"Failed to download file. Status code: {response.status_code}")
+
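
Note that requests.get() without stream=True buffers the whole response in memory, which is fine for the small JSON and tokenizer files fetched here. A hedged sketch of a streaming variant for larger artifacts (the name, chunk size, and raise_for_status() handling are assumptions, not part of the diff):

def download_file_with_auth_streaming(url, token, save_path):
    # stream to disk in 1 MiB chunks instead of buffering the whole body
    headers = {"Authorization": f"Bearer {token}"}
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    logger.info(f"File {save_path} downloaded successfully")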

# download the tokenizer models
for model in models:
@@ -81,10 +89,10 @@ def download_file_with_auth(url, token, save_path):
    if not os.path.exists(f"models/tokenizers/{name}"):
        os.makedirs(f"models/tokenizers/{name}")
    else:
-        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
        continue

-    print(f"Downloading {name} to models/tokenizers/{name}")
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")

    url = f"{repo}/raw/main/config.json"
    save_path = f"models/tokenizers/{name}/config.json"
@@ -115,76 +123,75 @@ def download_file_with_auth(url, token, save_path):
        continue

    # create the tokenizer
-    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

-    print(f"model: {name}")
-    print(f"tokt: {tokt}")
-    print(f"repo: {model['repo']}")
-    print(f"chktok: {chktok}")
-    print(f"chkhsh: {chkhsh}")
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        pre_tokenizer = cfg["pre_tokenizer"]
-        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

-    print(f"\n")
+    logger.info("")
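
For orientation, the pre_tokenizer entry dumped above comes straight from the model's tokenizer.json; for a byte-level BPE model such as gpt-2 it typically looks roughly like this (field values are indicative of common tokenizer.json files, not taken from this diff):

# indicative shape of cfg["pre_tokenizer"] for a byte-level BPE tokenizer
pre_tokenizer_example = {
    "type": "ByteLevel",
    "add_prefix_space": False,
    "trim_offsets": True,
    "use_regex": True,
}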

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"

-src_func = ""
-src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
-src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
-src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
-src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
-src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
-src_func += "\n"
-src_func += f"        chktxt = {repr(chktxt)}\n"
-src_func += "\n"
-src_func += "        chktok = tokenizer.encode(chktxt)\n"
-src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
-src_func += "\n"
-src_func += "        print(f\"chktok: {chktok}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        res = None\n"
-src_func += "\n"
-src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
-src_func += "        #       or pull the latest version of the model from Huggingface\n"
-src_func += "        #       don't edit the hashes manually!\n"
-src_func += f"{src_ifs}\n"
-src_func += "        if res is None:\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func += "            print(\"**          There are 2 possible reasons for this:\")\n"
-src_func += "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
-src_func += "            print(\"**          - the pre-tokenization config has changed upstream\")\n"
-src_func += "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
-src_func += "            print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
-src_func += "            print(\"**\")\n"
-src_func += "            print(f\"** chkhsh:  {chkhsh}\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
-src_func += "\n"
-src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        return res\n"
-
-print(src_func)
-
-print("\n")
-print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-print("\n")
+src_func = ""  # noqa: E222
+src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"  # noqa: E222
+src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"  # noqa: E222
+src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"  # noqa: E222
+src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"  # noqa: E222
+src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += f"        chktxt = {repr(chktxt)}\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        chktok = tokenizer.encode(chktxt)\n"  # noqa: E222
+src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        print(f\"chktok: {chktok}\")\n"  # noqa: E222
+src_func += "        print(f\"chkhsh: {chkhsh}\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        res = None\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"  # noqa: E222
+src_func += "        #       or pull the latest version of the model from Huggingface\n"  # noqa: E222
+src_func += "        #       don't edit the hashes manually!\n"  # noqa: E222
+src_func += f"{src_ifs}\n"  # noqa: E222
+src_func += "        if res is None:\n"  # noqa: E222
+src_func += "            print(\"\\n\")\n"  # noqa: E222
+src_func += "            print(\"**************************************************************************************\")\n"  # noqa: E222
+src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"  # noqa: E222
+src_func += "            print(\"**          There are 2 possible reasons for this:\")\n"  # noqa: E222
+src_func += "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"  # noqa: E222
+src_func += "            print(\"**          - the pre-tokenization config has changed upstream\")\n"  # noqa: E222
+src_func += "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"  # noqa: E222
+src_func += "            print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"  # noqa: E222
+src_func += "            print(\"**\")\n"  # noqa: E222
+src_func += "            print(f\"** chkhsh:  {chkhsh}\")\n"  # noqa: E222
+src_func += "            print(\"**************************************************************************************\")\n"  # noqa: E222
+src_func += "            print(\"\\n\")\n"  # noqa: E222
+src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"  # noqa: E222
+src_func += "        print(f\"chkhsh: {chkhsh}\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        return res\n"  # noqa: E222
+
+print(src_func)  # noqa: NP100
+
+logger.info("\n")
+logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+logger.info("\n")
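
Once pasted into convert-hf-to-gguf.py, the generated get_vocab_base_pre() contains one if-branch per model, built from src_ifs above; with placeholder hashes it reads roughly like this (the hex values below are placeholders for illustration, real hashes must never be edited by hand):

        # shape of the generated dispatch (placeholder hashes)
        if chkhsh == "aaaa...":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-bpe"
        if chkhsh == "bbbb...":
            # ref: https://huggingface.co/openai-community/gpt2
            res = "gpt-2"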

# generate tests for each tokenizer model

@@ -250,7 +257,6 @@ def download_file_with_auth(url, token, save_path):
    tokt = model["tokt"]

    # create the tokenizer
-    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@@ -265,15 +271,15 @@ def download_file_with_auth(url, token, save_path):
                f.write(f" {r}")
            f.write("\n")

-    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
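
Each line of the .gguf.out file written above holds the space-prefixed token ids for one test string, so the llama.cpp tokenizer tests can diff their own encoding against it line by line. A small sketch of reading it back (the helper name is illustrative):

def read_vocab_out(path):
    # one list of token ids per test string; split() absorbs the leading space
    with open(path, "r", encoding="utf-8") as f:
        return [[int(tok) for tok in line.split()] for line in f]

# e.g. expected = read_vocab_out("models/ggml-vocab-gpt-2.gguf.out")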

# generate commands for creating vocab files

-print("\nRun the following commands to generate the vocab files for testing:\n")
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")

for model in models:
    name = model["name"]

-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")

-print("\n")
+logger.info("\n")