# TODO: automate the update of convert-hf-to-gguf.py
#

+import logging
import os
import requests
import sys
import json

from hashlib import sha256
from enum import IntEnum, auto
+from transformers import AutoTokenizer
+
+logger = logging.getLogger("convert-hf-to-gguf-update")
+
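
The diff introduces a module-level logger but leaves handler setup to the caller. A minimal sketch of wiring it up, assuming plain stderr output (the --verbose check is illustrative, not part of this script):

# sketch only: configure a handler so the logger.info() calls below are visible
import logging
import sys

logging.basicConfig(
    stream=sys.stderr,
    level=logging.DEBUG if "--verbose" in sys.argv else logging.INFO,
    format="%(name)s: %(message)s",
)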

class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()

+
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
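
The intent of chktxt: two tokenizers with different pre-tokenizers will almost certainly emit different token ids for this string, so hashing the ids gives a cheap, stable fingerprint. A minimal sketch of the computation, reusing the script's own recipe (the gpt-2 path is just one of the directories populated below):

# sketch: fingerprint a tokenizer by hashing the ids it produces for chktxt
tok = AutoTokenizer.from_pretrained("models/tokenizers/gpt-2")
ids = tok.encode(chktxt)
fingerprint = sha256(str(ids).encode()).hexdigest()
# same vocab + same pre-tokenizer => same fingerprint across runs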

if len(sys.argv) == 2:
    token = sys.argv[1]
else:
-    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

# TODO: add models here, base models preferred
models = [
-    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-]
+    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+]

# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
    os.makedirs("models/tokenizers")

+
def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
-        print(f"File {save_path} downloaded successfully")
+        logger.info(f"File {save_path} downloaded successfully")
    else:
-        print(f"Failed to download file. Status code: {response.status_code}")
+        logger.info(f"Failed to download file. Status code: {response.status_code}")
+
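
Note that requests.get() without stream=True buffers the whole response in memory, which is fine for the small JSON and tokenizer files fetched here. A hedged sketch of a streaming variant for larger artifacts (the name, chunk size, and raise_for_status() handling are assumptions, not part of the diff):

def download_file_with_auth_streaming(url, token, save_path):
    # stream to disk in 1 MiB chunks instead of buffering the whole body
    headers = {"Authorization": f"Bearer {token}"}
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    logger.info(f"File {save_path} downloaded successfully")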

# download the tokenizer models
for model in models:
@@ -81,10 +89,10 @@ def download_file_with_auth(url, token, save_path):
    if not os.path.exists(f"models/tokenizers/{name}"):
        os.makedirs(f"models/tokenizers/{name}")
    else:
-        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
        continue

-    print(f"Downloading {name} to models/tokenizers/{name}")
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")

    url = f"{repo}/raw/main/config.json"
    save_path = f"models/tokenizers/{name}/config.json"
@@ -115,76 +123,75 @@ def download_file_with_auth(url, token, save_path):
        continue

    # create the tokenizer
-    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

-    print(f"model: {name}")
-    print(f"tokt: {tokt}")
-    print(f"repo: {model['repo']}")
-    print(f"chktok: {chktok}")
-    print(f"chkhsh: {chkhsh}")
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        pre_tokenizer = cfg["pre_tokenizer"]
-        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

-    print(f"\n")
+    logger.info("")
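
For orientation, the pre_tokenizer entry dumped above comes straight from the model's tokenizer.json; for a byte-level BPE model such as gpt-2 it typically looks roughly like this (field values are indicative of common tokenizer.json files, not taken from this diff):

# indicative shape of cfg["pre_tokenizer"] for a byte-level BPE tokenizer
pre_tokenizer_example = {
    "type": "ByteLevel",
    "add_prefix_space": False,
    "trim_offsets": True,
    "use_regex": True,
}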

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"

-src_func = ""
-src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
-src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
-src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
-src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
-src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
-src_func += "\n"
-src_func += f"        chktxt = {repr(chktxt)}\n"
-src_func += "\n"
-src_func += "        chktok = tokenizer.encode(chktxt)\n"
-src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
-src_func += "\n"
-src_func += "        print(f\"chktok: {chktok}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        res = None\n"
-src_func += "\n"
-src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
-src_func += "        #       or pull the latest version of the model from Huggingface\n"
-src_func += "        #       don't edit the hashes manually!\n"
-src_func += f"{src_ifs}\n"
-src_func += "        if res is None:\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func += "            print(\"**          There are 2 possible reasons for this:\")\n"
-src_func += "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
-src_func += "            print(\"**          - the pre-tokenization config has changed upstream\")\n"
-src_func += "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
-src_func += "            print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
-src_func += "            print(\"**\")\n"
-src_func += "            print(f\"** chkhsh:  {chkhsh}\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
-src_func += "\n"
-src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        return res\n"
-
-print(src_func)
-
-print("\n")
-print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-print("\n")
+src_func = ""  # noqa: E222
+src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"  # noqa: E222
+src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"  # noqa: E222
+src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"  # noqa: E222
+src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"  # noqa: E222
+src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += f"        chktxt = {repr(chktxt)}\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        chktok = tokenizer.encode(chktxt)\n"  # noqa: E222
+src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        print(f\"chktok: {chktok}\")\n"  # noqa: E222
+src_func += "        print(f\"chkhsh: {chkhsh}\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        res = None\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"  # noqa: E222
+src_func += "        #       or pull the latest version of the model from Huggingface\n"  # noqa: E222
+src_func += "        #       don't edit the hashes manually!\n"  # noqa: E222
+src_func += f"{src_ifs}\n"  # noqa: E222
+src_func += "        if res is None:\n"  # noqa: E222
+src_func += "            print(\"\\n\")\n"  # noqa: E222
+src_func += "            print(\"**************************************************************************************\")\n"  # noqa: E222
+src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"  # noqa: E222
+src_func += "            print(\"**          There are 2 possible reasons for this:\")\n"  # noqa: E222
+src_func += "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"  # noqa: E222
+src_func += "            print(\"**          - the pre-tokenization config has changed upstream\")\n"  # noqa: E222
+src_func += "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"  # noqa: E222
+src_func += "            print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"  # noqa: E222
+src_func += "            print(\"**\")\n"  # noqa: E222
+src_func += "            print(f\"** chkhsh:  {chkhsh}\")\n"  # noqa: E222
+src_func += "            print(\"**************************************************************************************\")\n"  # noqa: E222
+src_func += "            print(\"\\n\")\n"  # noqa: E222
+src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"  # noqa: E222
+src_func += "        print(f\"chkhsh: {chkhsh}\")\n"  # noqa: E222
+src_func += "\n"  # noqa: E222
+src_func += "        return res\n"  # noqa: E222
+
+print(src_func)  # noqa: NP100
+
+logger.info("\n")
+logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+logger.info("\n")
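
Once pasted into convert-hf-to-gguf.py, the generated get_vocab_base_pre() contains one if-branch per model, built from src_ifs above; with placeholder hashes it reads roughly like this (the hex values below are placeholders for illustration, real hashes must never be edited by hand):

        # shape of the generated dispatch (placeholder hashes)
        if chkhsh == "aaaa...":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-bpe"
        if chkhsh == "bbbb...":
            # ref: https://huggingface.co/openai-community/gpt2
            res = "gpt-2"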

# generate tests for each tokenizer model

@@ -250,7 +257,6 @@ def download_file_with_auth(url, token, save_path):
    tokt = model["tokt"]

    # create the tokenizer
-    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@@ -265,15 +271,15 @@ def download_file_with_auth(url, token, save_path):
                f.write(f" {r}")
            f.write("\n")

-    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
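
Each line of the .gguf.out file written above holds the space-prefixed token ids for one test string, so the llama.cpp tokenizer tests can diff their own encoding against it line by line. A small sketch of reading it back (the helper name is illustrative):

def read_vocab_out(path):
    # one list of token ids per test string; split() absorbs the leading space
    with open(path, "r", encoding="utf-8") as f:
        return [[int(tok) for tok in line.split()] for line in f]

# e.g. expected = read_vocab_out("models/ggml-vocab-gpt-2.gguf.out")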

# generate commands for creating vocab files

-print("\nRun the following commands to generate the vocab files for testing:\n")
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")

for model in models:
    name = model["name"]

-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")

-print("\n")
+logger.info("\n")