Commit fcc5a5e

*.py: fix flake8 warnings
1 parent 5e5e74e commit fcc5a5e

6 files changed: +103, -95 lines

.flake8

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 [flake8]
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
-exclude = examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py
+exclude = examples/*,examples/*/**,*/**/__init__.py
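
Note: with convert-hf-to-gguf-update.py dropped from the exclude list, flake8 now lints the update script too; the commit keeps it clean with per-line suppressions rather than new exclusions. A minimal sketch of that pattern (illustrative values; E222 is pycodestyle's "multiple spaces after operator" check, and NP100 is assumed here to be a repo-local check that flags print() calls):

    total =  40      # noqa: E222 - extra space after '=' kept deliberately
    print(total)     # noqa: NP100 - deliberate console output (assumed repo-local check)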

convert-hf-to-gguf-update.py

Lines changed: 81 additions & 75 deletions
@@ -21,56 +21,64 @@
 # TODO: automate the update of convert-hf-to-gguf.py
 #
 
+import logging
 import os
 import requests
 import sys
 import json
 
 from hashlib import sha256
 from enum import IntEnum, auto
+from transformers import AutoTokenizer
+
+logger = logging.getLogger("convert-hf-to-gguf-update")
+
 
 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
     BPE = auto()
     WPM = auto()
 
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
 else:
-    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
     sys.exit(1)
 
 # TODO: add models here, base models preferred
 models = [
-    { "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    { "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    { "name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    { "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    { "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    { "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    { "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    { "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-]
+    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+]
 
 # make directory "models/tokenizers" if it doesn't exist
 if not os.path.exists("models/tokenizers"):
     os.makedirs("models/tokenizers")
 
+
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
     response = requests.get(url, headers=headers)
     if response.status_code == 200:
         with open(save_path, 'wb') as f:
             f.write(response.content)
-        print(f"File {save_path} downloaded successfully")
+        logger.info(f"File {save_path} downloaded successfully")
     else:
-        print(f"Failed to download file. Status code: {response.status_code}")
+        logger.info(f"Failed to download file. Status code: {response.status_code}")
+
 
 # download the tokenizer models
 for model in models:
@@ -81,10 +89,10 @@ def download_file_with_auth(url, token, save_path):
     if not os.path.exists(f"models/tokenizers/{name}"):
         os.makedirs(f"models/tokenizers/{name}")
     else:
-        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
         continue
 
-    print(f"Downloading {name} to models/tokenizers/{name}")
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")
 
     url = f"{repo}/raw/main/config.json"
     save_path = f"models/tokenizers/{name}/config.json"
@@ -115,76 +123,75 @@ def download_file_with_auth(url, token, save_path):
         continue
 
     # create the tokenizer
-    from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
 
-    print(f"model: {name}")
-    print(f"tokt: {tokt}")
-    print(f"repo: {model['repo']}")
-    print(f"chktok: {chktok}")
-    print(f"chkhsh: {chkhsh}")
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")
 
     # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
         cfg = json.load(f)
         pre_tokenizer = cfg["pre_tokenizer"]
-        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
 
-    print(f"\n")
+    logger.info("")
 
     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
     src_ifs += f"            res = \"{name}\"\n"
 
-src_func = ""
-src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
-src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
-src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
-src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
-src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
-src_func += "\n"
-src_func += f"        chktxt = {repr(chktxt)}\n"
-src_func += "\n"
-src_func += "        chktok = tokenizer.encode(chktxt)\n"
-src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
-src_func += "\n"
-src_func += "        print(f\"chktok: {chktok}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        res = None\n"
-src_func += "\n"
-src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
-src_func += "        # or pull the latest version of the model from Huggingface\n"
-src_func += "        # don't edit the hashes manually!\n"
-src_func += f"{src_ifs}\n"
-src_func += "        if res is None:\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func += "            print(\"** There are 2 possible reasons for this:\")\n"
-src_func += "            print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
-src_func += "            print(\"** - the pre-tokenization config has changed upstream\")\n"
-src_func += "            print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
-src_func += "            print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
-src_func += "            print(\"**\")\n"
-src_func += "            print(f\"** chkhsh: {chkhsh}\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
-src_func += "\n"
-src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        return res\n"
-
-print(src_func)
-
-print("\n")
-print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-print("\n")
+src_func = ""  # noqa: E222
+src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"  # noqa: E222
+src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"  # noqa: E222
+src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"  # noqa: E222
+src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"  # noqa: E222
+src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += f"        chktxt = {repr(chktxt)}\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        chktok = tokenizer.encode(chktxt)\n"  # noqa: E222
+src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        print(f\"chktok: {chktok}\")\n"  # noqa: E222
+src_func += "        print(f\"chkhsh: {chkhsh}\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        res = None\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"  # noqa: E222
+src_func += "        # or pull the latest version of the model from Huggingface\n"  # noqa: E222
+src_func += "        # don't edit the hashes manually!\n"  # noqa: E222
+src_func += f"{src_ifs}\n"  # noqa: E222
+src_func += "        if res is None:\n"  # noqa: E222
+src_func += "            print(\"\\n\")\n"  # noqa: E222
+src_func += "            print(\"**************************************************************************************\")\n"  # noqa: E222
+src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"  # noqa: E222
+src_func += "            print(\"** There are 2 possible reasons for this:\")\n"  # noqa: E222
+src_func += "            print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"  # noqa: E222
+src_func += "            print(\"** - the pre-tokenization config has changed upstream\")\n"  # noqa: E222
+src_func += "            print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"  # noqa: E222
+src_func += "            print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"  # noqa: E222
+src_func += "            print(\"**\")\n"  # noqa: E222
+src_func += "            print(f\"** chkhsh: {chkhsh}\")\n"  # noqa: E222
+src_func += "            print(\"**************************************************************************************\")\n"  # noqa: E222
+src_func += "            print(\"\\n\")\n"  # noqa: E222
+src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"  # noqa: E222
+src_func += "        print(f\"chkhsh: {chkhsh}\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        return res\n"  # noqa: E222
+
+print(src_func)  # noqa: NP100
+
+logger.info("\n")
+logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+logger.info("\n")
 
 # generate tests for each tokenizer model
 
@@ -250,7 +257,6 @@ def download_file_with_auth(url, token, save_path):
     tokt = model["tokt"]
 
     # create the tokenizer
-    from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@@ -265,15 +271,15 @@ def download_file_with_auth(url, token, save_path):
             f.write(f" {r}")
         f.write("\n")
 
-    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
 
 # generate commands for creating vocab files
 
-print("\nRun the following commands to generate the vocab files for testing:\n")
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")
 
 for model in models:
     name = model["name"]
 
-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
 
-print("\n")
+logger.info("\n")
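
The changes above are mostly a mechanical print()-to-logger migration. A minimal, self-contained sketch of the pattern for reference (the basicConfig call in the main guard is an assumption about how a caller enables output; it is not part of this diff):

    import logging

    # named logger, as introduced at the top of convert-hf-to-gguf-update.py
    logger = logging.getLogger("convert-hf-to-gguf-update")


    def show_usage() -> None:
        # informational output goes through the logger instead of print()
        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")


    if __name__ == "__main__":
        # assumption: level INFO so that logger.info() records are actually emitted
        logging.basicConfig(level=logging.INFO)
        show_usage()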

convert-hf-to-gguf.py

Lines changed: 16 additions & 16 deletions
@@ -276,8 +276,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         chktok = tokenizer.encode(chktxt)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
 
-        print(f"chktok: {chktok}")
-        print(f"chkhsh: {chkhsh}")
+        logger.debug(f"chktok: {chktok}")
+        logger.debug(f"chkhsh: {chkhsh}")
 
         res = None
 
@@ -310,22 +310,22 @@ def get_vocab_base_pre(self, tokenizer) -> str:
             res = "gpt-2"
 
         if res is None:
-            print("\n")
-            print("**************************************************************************************")
-            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("** There are 2 possible reasons for this:")
-            print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
-            print("** - the pre-tokenization config has changed upstream")
-            print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-            print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
-            print("**")
-            print(f"** chkhsh: {chkhsh}")
-            print("**************************************************************************************")
-            print("\n")
+            logger.warning("\n")
+            logger.warning("**************************************************************************************")
+            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+            logger.warning("** There are 2 possible reasons for this:")
+            logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+            logger.warning("** - the pre-tokenization config has changed upstream")
+            logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("**")
+            logger.warning(f"** chkhsh: {chkhsh}")
+            logger.warning("**************************************************************************************")
+            logger.warning("\n")
             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
 
-        print(f"tokenizer.ggml.pre: {res}")
-        print(f"chkhsh: {chkhsh}")
+        logger.debug(f"tokenizer.ggml.pre: {res}")
+        logger.debug(f"chkhsh: {chkhsh}")
 
         return res
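For context, the chkhsh value that get_vocab_base_pre() logs is a fingerprint of the pre-tokenizer: the check string is tokenized and the resulting token IDs are hashed, so two models agree on the hash exactly when their pre-tokenizers split text identically. A standalone sketch of the technique (the directory and check string here are illustrative; the real scripts use models/tokenizers/<name> and the long chktxt above):

    from hashlib import sha256

    from transformers import AutoTokenizer

    # illustrative path - any downloaded tokenizer directory works
    tokenizer = AutoTokenizer.from_pretrained("models/tokenizers/llama-bpe")

    # same pre-tokenizer => same token IDs => same hash
    chktok = tokenizer.encode("example text:  spaces, digits 3333, emoji 🚀")
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(f"chkhsh: {chkhsh}")  # noqa: NP100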

gguf-py/gguf/gguf_reader.py

Lines changed: 3 additions & 1 deletion
@@ -4,6 +4,7 @@
 #
 from __future__ import annotations
 
+import logging
 import os
 from collections import OrderedDict
 from typing import Any, Literal, NamedTuple, TypeVar, Union
@@ -27,6 +28,7 @@
     GGUFValueType,
 )
 
+logger = logging.getLogger(__name__)
 
 READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]
 
@@ -142,7 +144,7 @@ def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
             # TODO: add option to generate error on duplicate keys
             # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
 
-            print(f'Warning: Duplicate key {field.name} at offset {field.offset}')
+            logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
             self.fields[field.name + '_{}'.format(field.offset)] = field
         else:
             self.fields[field.name] = field
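
gguf_reader.py is library code, so it takes the conventional module-level logger named via __name__ rather than a script name. One practical consequence, sketched below (the setLevel call is an assumption about a typical embedding application, not part of the diff):

    import logging

    # inside gguf/gguf_reader.py the logger is named after the module
    logger = logging.getLogger(__name__)  # -> "gguf.gguf_reader"

    # an application embedding gguf-py can now silence duplicate-key
    # warnings without touching its own logging configuration
    logging.getLogger("gguf.gguf_reader").setLevel(logging.ERROR)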

tests/test-tokenizer-0-bpe.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 
 from transformers import AutoTokenizer
 
-logger = logging.getLogger("convert")
+logger = logging.getLogger("test-tokenizer-0-bpe")
 
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")

tests/test-tokenizer-0-spm.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 
 from sentencepiece import SentencePieceProcessor
 
-logger = logging.getLogger("test-tokenizer-0-llama")
+logger = logging.getLogger("test-tokenizer-0-spm")
 
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
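
Both test scripts now name their loggers after the script itself: test-tokenizer-0-bpe previously reused the "convert" logger, and test-tokenizer-0-spm carried the stale "test-tokenizer-0-llama" name. A small sketch of what correct naming buys (the format string is illustrative):

    import logging

    logging.basicConfig(format="%(name)s: %(message)s", level=logging.INFO)
    logging.getLogger("test-tokenizer-0-spm").info("tokenizer tests starting")
    # prints "test-tokenizer-0-spm: tokenizer tests starting" - the record
    # now identifies the script that produced it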
