From df4580e7c2dcf9f5aef939d2982cd0b0a6b786db Mon Sep 17 00:00:00 2001 From: glide-the Date: Tue, 22 Apr 2025 11:49:56 +0800 Subject: [PATCH 1/5] Fix ChatGLMModel for glm-4-9b cannot find tokenizer merges in model file --- convert_hf_to_gguf.py | 59 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2bf97475f78dd..c4f4a11581c02 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -538,7 +538,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size @@ -738,6 +738,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": # ref: https://huggingface.co/THUDM/glm-4-9b-hf res = "glm4" + if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": + # ref: https://huggingface.co/THUDM/glm-4-9b-hf + res = "glm4" if res is None: logger.warning("\n") @@ -5022,16 +5025,60 @@ def set_vocab(self): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) + vocab_size = hparams.get("padded_vocab_size",hparams.get("vocab_size")) assert max(tokenizer.get_vocab().values()) < vocab_size - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()} + added_vocab = tokenizer.get_added_vocab() + + added_tokens_decoder = tokenizer.added_tokens_decoder + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. 
+ token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + + special_vocab=gguf.SpecialVocab( + self.dir_model, + load_merges=False, + n_vocab=vocab_size + ) # only add special tokens when they were not already loaded from config.json + + #TODO In llama.cpp, special tokens are mapped one-to-one between a token and a coordinate. However, in reality, a transformer might associate a special token like eos_token_id with multiple tokens. + # Currently, llama.cpp only supports a one-to-one mapping. + # This can lead to an issue where the model fails to terminate properly. + # I'm still unclear about how llama.cpp handles special_token and what the exact call chain is! + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|observation|>"]) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|user|>"]) special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # this one is usually not in config.json anyway @@ -5045,7 +5092,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) self.gguf_writer.add_embedding_length(n_embed) self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) - self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"])) + self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))) self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) From 4ce6630582a4eb4a0a387f5185587d4720024cf6 Mon Sep 17 00:00:00 2001 From: glide-the Date: Tue, 22 Apr 2025 17:37:57 +0800 Subject: [PATCH 2/5] update tokenizer_model --- convert_hf_to_gguf.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c4f4a11581c02..9c3d64108abb8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -538,7 +538,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size @@ -738,9 +738,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": # ref: https://huggingface.co/THUDM/glm-4-9b-hf res = "glm4" - if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": - # ref: https://huggingface.co/THUDM/glm-4-9b-hf - res = "glm4" if res is None: logger.warning("\n") @@ -5025,7 +5022,7 @@ def set_vocab(self): from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = 
hparams.get("padded_vocab_size",hparams.get("vocab_size")) + vocab_size = hparams.get("padded_vocab_size", hparams.get("vocab_size")) assert max(tokenizer.get_vocab().values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) @@ -5052,16 +5049,12 @@ def set_vocab(self): if added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) - else: - # NOTE: this was added for Gemma. - # Encoding and decoding the tokens above isn't sufficient for this case. - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes.append(gguf.TokenType.USER_DEFINED) + else: toktypes.append(gguf.TokenType.NORMAL) tokens.append(token) - self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) @@ -5076,11 +5069,9 @@ def set_vocab(self): #TODO In llama.cpp, special tokens are mapped one-to-one between a token and a coordinate. However, in reality, a transformer might associate a special token like eos_token_id with multiple tokens. # Currently, llama.cpp only supports a one-to-one mapping. # This can lead to an issue where the model fails to terminate properly. - # I'm still unclear about how llama.cpp handles special_token and what the exact call chain is! - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|observation|>"]) - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|user|>"]) - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) + # You can see a temporary workaround here. https://github.com/ggml-org/llama.cpp/issues/9606 + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # this one is usually not in config.json anyway special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) From 164e34e6b06904f78fecc5f034e07d068e6c9d8b Mon Sep 17 00:00:00 2001 From: glide-the Date: Tue, 22 Apr 2025 17:47:10 +0800 Subject: [PATCH 3/5] update tokenizer_model name --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9c3d64108abb8..579fad060cdab 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5054,7 +5054,7 @@ def set_vocab(self): toktypes.append(gguf.TokenType.NORMAL) tokens.append(token) - self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) From 1606e810933815eb6eda95a5e4b238e0fe534d3a Mon Sep 17 00:00:00 2001 From: glide-the Date: Wed, 23 Apr 2025 11:54:20 +0800 Subject: [PATCH 4/5] Supports compatibility with GLM variant models, including both LLaMA and GPT-2 style tokenizers. 
--- convert_hf_to_gguf.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 579fad060cdab..fb0fcc87d9333 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5054,16 +5054,28 @@ def set_vocab(self): toktypes.append(gguf.TokenType.NORMAL) tokens.append(token) - self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - - special_vocab=gguf.SpecialVocab( - self.dir_model, - load_merges=False, - n_vocab=vocab_size - ) + try: + # for https://huggingface.co/THUDM/glm-4-9b + special_vocab=gguf.SpecialVocab( + self.dir_model, + load_merges=True, + n_vocab=vocab_size + ) + + self.gguf_writer.add_tokenizer_model("gpt2") + except Exception as e: + logger.warning(f'Failed to load special tokens: {e}') + # for https://huggingface.co/THUDM/glm-4-9b-hf + special_vocab=gguf.SpecialVocab( + self.dir_model, + load_merges=False, + n_vocab=vocab_size + ) + self.gguf_writer.add_tokenizer_model("llama") + # only add special tokens when they were not already loaded from config.json #TODO In llama.cpp, special tokens are mapped one-to-one between a token and a coordinate. However, in reality, a transformer might associate a special token like eos_token_id with multiple tokens. From 7b42c07d1c881330556571f477bd8717f207063f Mon Sep 17 00:00:00 2001 From: glide-the Date: Wed, 23 Apr 2025 13:08:43 +0800 Subject: [PATCH 5/5] Supports compatibility with GLM variant models, including both LLaMA and GPT-2 style tokenizers. --- convert_hf_to_gguf.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index fb0fcc87d9333..92e2480431b97 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5058,14 +5058,19 @@ def set_vocab(self): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) try: + tokenizer_file = self.dir_model / 'tokenizer.json' + if not tokenizer_file.is_file(): + raise ValueError("tokenizer.json not found") + # for https://huggingface.co/THUDM/glm-4-9b special_vocab=gguf.SpecialVocab( self.dir_model, load_merges=True, n_vocab=vocab_size ) - + self.gguf_writer.add_tokenizer_model("gpt2") + except Exception as e: logger.warning(f'Failed to load special tokens: {e}') # for https://huggingface.co/THUDM/glm-4-9b-hf
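Taken together, patches 4 and 5 make ChatGLMModel.set_vocab choose the GGUF tokenizer model from what the checkpoint ships: if a tokenizer.json with BPE merges is present (THUDM/glm-4-9b), merges are loaded and the "gpt2" tokenizer model is written; otherwise (THUDM/glm-4-9b-hf) the conversion falls back to the "llama" tokenizer model without merges. Below is a minimal sketch of that control flow, using only names that appear in the diffs above; since the tail of patch 5 is cut off here, the fallback branch is taken from patch 4 and the sketch is illustrative, not the final committed code.

    # Sketch of the selection logic inside ChatGLMModel.set_vocab after this series
    # (runs in the method context shown above: self, gguf, logger, vocab_size).
    try:
        tokenizer_file = self.dir_model / 'tokenizer.json'
        if not tokenizer_file.is_file():
            raise ValueError("tokenizer.json not found")

        # BPE checkpoint (e.g. THUDM/glm-4-9b): load merges, write the gpt2 tokenizer model
        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, n_vocab=vocab_size)
        self.gguf_writer.add_tokenizer_model("gpt2")
    except Exception as e:
        logger.warning(f'Failed to load special tokens: {e}')
        # HF-style checkpoint (e.g. THUDM/glm-4-9b-hf): no merges, fall back to the llama tokenizer model
        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, n_vocab=vocab_size)
        self.gguf_writer.add_tokenizer_model("llama")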