diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 27983fadf4ac5..53333ff22e59f 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -22,17 +22,16 @@
 # TODO: generate tokenizer tests for llama.cpp
 #
 
+import json
 import logging
 import os
 import pathlib
 import re
-
-import requests
 import sys
-import json
-
-from hashlib import sha256
 from enum import IntEnum, auto
+from hashlib import sha256
+
+import requests
 from transformers import AutoTokenizer
 
 logging.basicConfig(level=logging.DEBUG)
@@ -72,6 +71,12 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "phi", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-1", },
+    {"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
+    {"name": "mistral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
+    {"name": "mistral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", },
+    {"name": "mixtral-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
+    {"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
     {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
     {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
@@ -314,12 +319,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
 
 # generate commands for creating vocab files
-
-logger.info("\nRun the following commands to generate the vocab files for testing:\n")
+shscript = "#!/usr/bin/env bash\n\n"
 
 for model in models:
     name = model["name"]
 
-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")  # noqa: NP100
+    tmpline = f"python3 convert-hf-to-gguf.py models/tokenizers/{name} --outfile models/ggml-vocab-{name}.gguf --vocab-only\n"
+    shscript += tmpline
+    logger.info(tmpline.strip())
 
-logger.info("\n")
+with open("generate-vocab.sh", "w", encoding="utf-8") as f:
+    f.write(shscript)
+    logger.info(f"Wrote {len(shscript)} bytes to generate-vocab.sh")
+logger.info("Run the following command to generate the vocab files for testing:")
+logger.info("Enable execution: chmod +x generate-vocab.sh")
+logger.info("Execute with ./generate-vocab.sh")
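The chkhsh constants matched below in get_vocab_base_pre() come out of this update script: it downloads each tokenizer, encodes a fixed probe string, and hashes the resulting token ids. A minimal sketch of that fingerprinting step, assuming a placeholder probe string (the real chktxt in convert-hf-to-gguf-update.py is a long mixed-script stress string) and an already-downloaded tokenizer directory:

```python
from hashlib import sha256

from transformers import AutoTokenizer

# Placeholder probe; stands in for the real chktxt defined in the update script.
chktxt = "Hello World \n\t 3.14 ..."

# Encode the probe and fingerprint the token ids. Two checkpoints whose
# tokenizers pre-tokenize identically will produce the same chkhsh.
tokenizer = AutoTokenizer.from_pretrained("models/tokenizers/phi")
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()

print(f"chkhsh: {chkhsh}")  # compare against the constants in get_vocab_base_pre()
```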
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5ba3161c76b96..805dd6a2f07b6 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2,17 +2,27 @@
 
 from __future__ import annotations
 
-import logging
 import argparse
 import contextlib
 import json
+import logging
 import os
 import re
 import sys
 from enum import IntEnum
-from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ContextManager,
+    Iterable,
+    Iterator,
+    Sequence,
+    TypeVar,
+    cast,
+)
 
 import numpy as np
 import torch
@@ -446,6 +456,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-1
+            res = "phi"
+        if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
+            # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
+            res = "stablelm"
+        if chkhsh == "e750a9b14dfed9b73287639bd1ecda50c38fa6011138f2f609804c6dab9ed5c2":
+            # ref: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
+            res = "mistral-bpe"
+        if chkhsh == "e750a9b14dfed9b73287639bd1ecda50c38fa6011138f2f609804c6dab9ed5c2":
+            # ref: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
+            res = "mixtral-bpe"
         if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
             # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
             res = "refact"
@@ -1703,6 +1725,7 @@ def set_gguf_parameters(self):
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
 
         self.gguf_writer.add_name("Phi2")
+        self.gguf_writer.add_tokenizer_pre("gpt-2")
         self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
         self.gguf_writer.add_embedding_length(n_embd)
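Note that Mistral-7B-Instruct-v0.2 and Mixtral-8x7B-Instruct-v0.1 hash to the same chkhsh above (the two repos ship the same BPE tokenizer), so the second branch is the one that ends up assigning res. The new add_tokenizer_pre("gpt-2") call for Phi2 records the pre-tokenizer choice in GGUF metadata under the tokenizer.ggml.pre key, which llm_load_vocab() reads back at load time. A minimal gguf-py sketch, assuming a throwaway output file:

```python
from gguf import GGUFWriter

# Throwaway example file; "phi2" is the GGUF architecture string for Phi2.
writer = GGUFWriter("ggml-vocab-demo.gguf", "phi2")
writer.add_name("Phi2")
writer.add_tokenizer_pre("gpt-2")  # stored as the tokenizer.ggml.pre KV pair

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```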
diff --git a/generate-vocab.sh b/generate-vocab.sh
new file mode 100755
index 0000000000000..0df4d6d91b0fb
--- /dev/null
+++ b/generate-vocab.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+python3 convert-hf-to-gguf.py models/tokenizers/llama-spm --outfile models/ggml-vocab-llama-spm.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/llama-bpe --outfile models/ggml-vocab-llama-bpe.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/phi-3 --outfile models/ggml-vocab-phi-3.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/deepseek-llm --outfile models/ggml-vocab-deepseek-llm.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/deepseek-coder --outfile models/ggml-vocab-deepseek-coder.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/falcon --outfile models/ggml-vocab-falcon.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/bert-bge --outfile models/ggml-vocab-bert-bge.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/mpt --outfile models/ggml-vocab-mpt.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/starcoder --outfile models/ggml-vocab-starcoder.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/gpt-2 --outfile models/ggml-vocab-gpt-2.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/phi --outfile models/ggml-vocab-phi.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/stablelm --outfile models/ggml-vocab-stablelm.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/mistral-bpe --outfile models/ggml-vocab-mistral-bpe.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/mistral-spm --outfile models/ggml-vocab-mistral-spm.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/mixtral-bpe --outfile models/ggml-vocab-mixtral-bpe.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/mixtral-spm --outfile models/ggml-vocab-mixtral-spm.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/refact --outfile models/ggml-vocab-refact.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/command-r --outfile models/ggml-vocab-command-r.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/qwen2 --outfile models/ggml-vocab-qwen2.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/olmo --outfile models/ggml-vocab-olmo.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/dbrx --outfile models/ggml-vocab-dbrx.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-v2-en --outfile models/ggml-vocab-jina-v2-en.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-v2-es --outfile models/ggml-vocab-jina-v2-es.gguf --vocab-only
+python3 convert-hf-to-gguf.py models/tokenizers/jina-v2-de --outfile models/ggml-vocab-jina-v2-de.gguf --vocab-only
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 8e1cac9152f55..4457fc9868a37 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -384,7 +384,7 @@ class TensorNameMap:
     mapping: dict[str, tuple[MODEL_TENSOR, str]]
 
-    def __init__(self, arch: MODEL_ARCH, n_blocks: int):
+    def __init__(self, arch: MODEL_ARCH, n_blocks: int, n_experts: int = 60):
         self.mapping = {}
         for tensor, keys in self.mappings_cfg.items():
             if tensor not in MODEL_TENSORS[arch]:
                 continue
@@ -398,7 +398,6 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int):
             if tensor not in MODEL_TENSORS[arch]:
                 continue
             # TODO: make this configurable
-            n_experts = 60
             for xid in range(n_experts):
                 tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                 self.mapping[tensor_name] = (tensor, tensor_name)
diff --git a/llama.cpp b/llama.cpp
index e11f0ac4b72dc..aa4bd5fba4ffc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4458,6 +4458,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "command-r") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+            } else if (
+                tokenizer_pre == "qwen") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN;
             } else if (
                 tokenizer_pre == "qwen2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
@@ -12354,6 +12357,12 @@ struct llm_tokenizer_bpe {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_QWEN:
+                word_collection = unicode_regex_split(text, {
+                    // original regex from tokenization_qwen.py
+                    "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                });
+                break;
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                 word_collection = unicode_regex_split(text, {
                     // original regex from tokenizer.json
diff --git a/llama.h b/llama.h
index 612e32c4ea058..6fe22e488f83c 100644
--- a/llama.h
+++ b/llama.h
@@ -81,9 +81,10 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
         LLAMA_VOCAB_PRE_TYPE_REFACT    = 8,
         LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
-        LLAMA_VOCAB_PRE_TYPE_QWEN2     = 10,
-        LLAMA_VOCAB_PRE_TYPE_OLMO      = 11,
-        LLAMA_VOCAB_PRE_TYPE_DBRX      = 12,
+        LLAMA_VOCAB_PRE_TYPE_QWEN      = 10,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2     = 11,
+        LLAMA_VOCAB_PRE_TYPE_OLMO      = 12,
+        LLAMA_VOCAB_PRE_TYPE_DBRX      = 13,
     };
 
     // note: these values should be synchronized with ggml_rope
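The QWEN pre-tokenizer split added above can be reproduced outside llama.cpp as a sanity check. A sketch using the third-party regex package (the standard re module does not support \p{L}/\p{N} classes), with the same pattern the new case passes to unicode_regex_split():

```python
import regex  # pip install regex; needed for \p{...} Unicode classes

# Same pattern as the LLAMA_VOCAB_PRE_TYPE_QWEN case above (tokenization_qwen.py).
QWEN_SPLIT = (
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"   # common English contractions
    r"|[^\r\n\p{L}\p{N}]?\p{L}+"      # letter runs, optionally led by one non-letter
    r"|\p{N}"                         # digits are split one at a time
    r"| ?[^\s\p{L}\p{N}]+[\r\n]*"     # punctuation runs
    r"|\s*[\r\n]+|\s+(?!\S)|\s+"      # whitespace handling
)

print(regex.findall(QWEN_SPLIT, "Hello world, it's 2024!"))
# -> ['Hello', ' world', ',', ' it', "'s", ' ', '2', '0', '2', '4', '!']
```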
diff --git a/models/ggml-vocab-bert-bge.gguf b/models/ggml-vocab-bert-bge.gguf
index b2cbd5df6882d..3dad13a8bc773 100644
Binary files a/models/ggml-vocab-bert-bge.gguf and b/models/ggml-vocab-bert-bge.gguf differ
diff --git a/models/ggml-vocab-command-r.gguf b/models/ggml-vocab-command-r.gguf
index b553eab330591..8d75b773d60f0 100644
Binary files a/models/ggml-vocab-command-r.gguf and b/models/ggml-vocab-command-r.gguf differ
diff --git a/models/ggml-vocab-deepseek-coder.gguf b/models/ggml-vocab-deepseek-coder.gguf
index 6728cd747249e..002a2872a06e7 100644
Binary files a/models/ggml-vocab-deepseek-coder.gguf and b/models/ggml-vocab-deepseek-coder.gguf differ
diff --git a/models/ggml-vocab-deepseek-llm.gguf b/models/ggml-vocab-deepseek-llm.gguf
index 5d66091c44b6f..f18fd3232fcaa 100644
Binary files a/models/ggml-vocab-deepseek-llm.gguf and b/models/ggml-vocab-deepseek-llm.gguf differ
diff --git a/models/ggml-vocab-falcon.gguf b/models/ggml-vocab-falcon.gguf
index 334d50da51ba5..66ecef4177ab1 100644
Binary files a/models/ggml-vocab-falcon.gguf and b/models/ggml-vocab-falcon.gguf differ
diff --git a/models/ggml-vocab-gpt-2.gguf b/models/ggml-vocab-gpt-2.gguf
index 5ea85cf52e7de..dd415e35d79c9 100644
Binary files a/models/ggml-vocab-gpt-2.gguf and b/models/ggml-vocab-gpt-2.gguf differ
diff --git a/models/ggml-vocab-llama-bpe.gguf b/models/ggml-vocab-llama-bpe.gguf
index e51a99118bc43..c880feb9e3d19 100644
Binary files a/models/ggml-vocab-llama-bpe.gguf and b/models/ggml-vocab-llama-bpe.gguf differ
diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp
index 9380bf355202a..0a89107c60d7f 100644
--- a/models/ggml-vocab-llama-bpe.gguf.inp
+++ b/models/ggml-vocab-llama-bpe.gguf.inp
@@ -104,5 +104,3 @@ __ggml_vocab_test__
 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
 __ggml_vocab_test__
- Việt
-__ggml_vocab_test__
 
diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out
index 1f3607fb6a378..1f00e3812e227 100644
--- a/models/ggml-vocab-llama-bpe.gguf.out
+++ b/models/ggml-vocab-llama-bpe.gguf.out
@@ -41,4 +41,3 @@
 8765 8765 1644 8765 8765 8765 198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
- 101798
 
diff --git a/models/ggml-vocab-llama-spm.gguf b/models/ggml-vocab-llama-spm.gguf
index 658295a5df741..2c051acb932cb 100644
Binary files a/models/ggml-vocab-llama-spm.gguf and b/models/ggml-vocab-llama-spm.gguf differ
diff --git a/models/ggml-vocab-mpt.gguf b/models/ggml-vocab-mpt.gguf
index f42f56dec9294..783e4f6b945a0 100644
Binary files a/models/ggml-vocab-mpt.gguf and b/models/ggml-vocab-mpt.gguf differ
diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf
index f8022a385e4aa..24cdfb5fb7803 100644
Binary files a/models/ggml-vocab-phi-3.gguf and b/models/ggml-vocab-phi-3.gguf differ
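The .inp/.out pairs updated above are fixtures for the tokenizer tests: the .inp file holds probe strings separated by __ggml_vocab_test__ marker lines, and each line of the .out file holds the expected token ids for the corresponding probe. A rough Python sketch of that pairing (the real consumer is the C++ tokenizer test binary):

```python
# Rough illustration of how the fixture files line up; names are real,
# the pairing logic here is only a sketch.
name = "models/ggml-vocab-llama-bpe.gguf"

with open(name + ".inp", encoding="utf-8") as f:
    probes = f.read().split("__ggml_vocab_test__\n")
with open(name + ".out", encoding="utf-8") as f:
    expected = f.read().splitlines()

for probe, line in zip(probes, expected):
    ids = [int(tok) for tok in line.split()]
    print(f"{probe.rstrip()!r} -> {len(ids)} tokens")
```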
diff --git a/models/ggml-vocab-qwen2.gguf b/models/ggml-vocab-qwen2.gguf
index 541e475bc9453..a3c795c03ced6 100644
Binary files a/models/ggml-vocab-qwen2.gguf and b/models/ggml-vocab-qwen2.gguf differ
diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf
index 52afcf01aeb73..e15b9dab744e5 100644
Binary files a/models/ggml-vocab-refact.gguf and b/models/ggml-vocab-refact.gguf differ
diff --git a/models/ggml-vocab-stablelm.gguf b/models/ggml-vocab-stablelm.gguf
index ebb0cdb7d6a4a..310cec63f2fd4 100644
Binary files a/models/ggml-vocab-stablelm.gguf and b/models/ggml-vocab-stablelm.gguf differ
diff --git a/models/ggml-vocab-starcoder.gguf b/models/ggml-vocab-starcoder.gguf
index 7a7e7742ab1fc..1682d0ae6ce75 100644
Binary files a/models/ggml-vocab-starcoder.gguf and b/models/ggml-vocab-starcoder.gguf differ
diff --git a/requirements/requirements-convert-hf-to-gguf-update.txt b/requirements/requirements-convert-hf-to-gguf-update.txt
index 6ac4026107fbe..ed118375c8afe 100644
--- a/requirements/requirements-convert-hf-to-gguf-update.txt
+++ b/requirements/requirements-convert-hf-to-gguf-update.txt
@@ -1,2 +1,3 @@
 -r ./requirements-convert.txt
 torch~=2.1.1
+tiktoken~=0.6.0
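The new tiktoken requirement presumably backs the Qwen support added above: Qwen's original tokenization_qwen.py implements its tokenizer on top of tiktoken, so the update script needs the package installed to encode that model's probe string. A quick sanity check of the pinned package, using the built-in cl100k_base encoding as a stand-in (Qwen constructs its own Encoding from its qwen.tiktoken vocabulary):

```python
import tiktoken

# Stand-in encoding: Qwen builds its own tiktoken Encoding, but
# cl100k_base exercises the same encode/decode API.
enc = tiktoken.get_encoding("cl100k_base")

ids = enc.encode("Hello world!")
print(ids)  # token ids, e.g. [9906, 1917, 0]
assert enc.decode(ids) == "Hello world!"
```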