From e913ac9c382b6fecca3287d573cb269d9a627482 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Sat, 30 Mar 2024 10:34:40 +0800
Subject: [PATCH 1/6] for new minicpm

---
 convert-hf-to-gguf.py     | 1 +
 examples/main/main.cpp    | 6 ++++--
 gguf-py/gguf/constants.py | 1 +
 llama.cpp                 | 3 ++-
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c5d2d0b7813d1..78a2b1c67f88f 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1559,6 +1559,7 @@ def set_vocab(self):
             self.gguf_writer.add_add_space_prefix(add_prefix)
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        print(special_vocab)
         old_eos = special_vocab.special_token_ids["eos"]
         if "chat" in os.path.basename(self.dir_model.absolute()):
             # For the chat model, we replace the eos with '<|im_end|>'.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e2d07a6319d50..2a19817d97990 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -795,7 +795,9 @@ int main(int argc, char ** argv) {
         }
 
         // deal with end of text token in interactive mode
-        if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+        auto last_token = llama_sampling_last(ctx_sampling);
+        if (last_token == llama_token_eos(model) || last_token == 122753)
+        {
             LOG("found EOS token\n");
 
             if (params.interactive) {
@@ -920,7 +922,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
+        if (!embd.empty() && (embd.back() == llama_token_eos(model) || embd.back() == 122753) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
         }
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 4ab026482a19e..ee26224d40b3d 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -548,6 +548,7 @@ class MODEL_TENSOR(IntEnum):
     ],
     MODEL_ARCH.MINICPM: [
         MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_NORM,
diff --git a/llama.cpp b/llama.cpp
index 892d46fbcfcec..8363233fa94da 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4375,6 +4375,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_MINICPM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
 
                 // output
                 {
@@ -8699,7 +8700,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);

From 9ecc666b94ca377ea4c4ce7706ca87239eab3d77 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Mon, 1 Apr 2024 15:07:42 +0800
Subject: [PATCH 2/6] compatible with old and new minicpm versions

---
 convert-hf-to-gguf.py       |  1 +
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  3 +++
 llama.cpp                   | 15 +++++++++++++--
 4 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 78a2b1c67f88f..d00481d02ec86 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1097,6 +1097,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_tie_lm_head(self.hparams["tie_lm_head"])
 
     def set_vocab(self):
         self._set_vocab_hf()
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ee26224d40b3d..9007c8addb468 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -43,6 +43,7 @@ class LLM:
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE      = "{arch}.pooling_type"
         LOGIT_SCALE       = "{arch}.logit_scale"
+        TIE_LM_HEAD       = "{arch}.tie_lm_head"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -805,6 +806,7 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_FEED_FORWARD_LENGTH   = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT    = Keys.LLM.TENSOR_DATA_LAYOUT
+KEY_TIE_LM_HEAD           = Keys.LLM.TIE_LM_HEAD
 
 # attention
 KEY_ATTENTION_HEAD_COUNT  = Keys.Attention.HEAD_COUNT
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 2ae6c814b52de..10465ab5e3109 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -342,6 +342,9 @@ def add_feed_forward_length(self, length: int) -> None:
 
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_tie_lm_head(self, tie_lm_head: bool) -> None:
+        self.add_bool(Keys.LLM.TIE_LM_HEAD.format(arch=self.arch), tie_lm_head)
 
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
diff --git a/llama.cpp b/llama.cpp
index 8363233fa94da..7337fc02c9d95 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -276,6 +276,7 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
+    LLM_KV_TIE_LM_HEAD,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -320,6 +321,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -345,6 +347,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_POOLING_TYPE ,     "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE,       "%s.logit_scale" },
+    { LLM_KV_TIE_LM_HEAD,       "%s.tie_lm_head" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1707,6 +1710,7 @@ struct llama_hparams {
 
     bool causal_attn = true;
     bool need_kq_pos = false;
+    bool tie_lm_head = true;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -3503,6 +3507,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_TIE_LM_HEAD, hparams.tie_lm_head, false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -4375,7 +4380,9 @@ static bool llm_load_tensors(
         case LLM_ARCH_MINICPM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                if (!hparams.tie_lm_head){
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                }
 
                 // output
                 {
@@ -8700,7 +8707,11 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+        if (hparams.tie_lm_head){
+            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        }else{
+            cur = ggml_mul_mat(ctx0, model.output, cur);
+        }
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);

From 8502a015a59b651ebe27b2d95376e27138ecaf2c Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Mon, 1 Apr 2024 17:00:07 +0800
Subject: [PATCH 3/6] remove eos

---
 examples/main/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 2a19817d97990..806af272509d8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -796,7 +796,7 @@ int main(int argc, char ** argv) {
 
         // deal with end of text token in interactive mode
         auto last_token = llama_sampling_last(ctx_sampling);
-        if (last_token == llama_token_eos(model) || last_token == 122753)
+        if (last_token == llama_token_eos(model))
         {
             LOG("found EOS token\n");
 
@@ -922,7 +922,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && (embd.back() == llama_token_eos(model) || embd.back() == 122753) && !(params.instruct || params.interactive || params.chatml)) {
+        if (!embd.empty() && (embd.back() == llama_token_eos(model)) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
         }

From 582b13c96669e899bfd18f87ced8c9d1d3194670 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Thu, 11 Apr 2024 15:35:18 +0800
Subject: [PATCH 4/6] for old config

---
 convert-hf-to-gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index d00481d02ec86..861dfd6437005 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1097,7 +1097,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_tie_lm_head(self.hparams["tie_lm_head"])
+        if "tie_lm_head" in self.hparams:
+            self.gguf_writer.add_tie_lm_head(self.hparams["tie_lm_head"])
 
     def set_vocab(self):
         self._set_vocab_hf()
@@ -1560,7 +1561,6 @@ def set_vocab(self):
             self.gguf_writer.add_add_space_prefix(add_prefix)
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        print(special_vocab)
         old_eos = special_vocab.special_token_ids["eos"]
         if "chat" in os.path.basename(self.dir_model.absolute()):
             # For the chat model, we replace the eos with '<|im_end|>'.
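
Note: the combined effect of PATCH 2/6 and PATCH 4/6 on the converter is that the GGUF key "{arch}.tie_lm_head" (so "minicpm.tie_lm_head" for this architecture) is written only when the Hugging Face config.json actually contains a "tie_lm_head" field. Older MiniCPM configs omit the field, the key is skipped, and the loader falls back to its default of tie_lm_head = true, meaning the lm_head reuses the token embedding. Below is a minimal stand-alone sketch of that decision; the helper name and the direct config.json read are assumptions for illustration, not code from the patch:

    import json
    from pathlib import Path

    def minicpm_tie_lm_head(model_dir: str) -> bool | None:
        # Mirrors the converter behaviour: report the flag only when the config
        # declares it; None means "do not write the key", in which case the
        # C++ loader keeps its default tie_lm_head = true (tied lm_head).
        hparams = json.loads((Path(model_dir) / "config.json").read_text())
        if "tie_lm_head" in hparams:
            return bool(hparams["tie_lm_head"])
        return None

    # Usage: only call gguf_writer.add_tie_lm_head(...) when the result is not None.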
From 4f61b3066e6ddd194008901820c1d2d1111534f4 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Thu, 11 Apr 2024 15:48:59 +0800
Subject: [PATCH 5/6] recover main.cpp

---
 examples/main/main.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 806af272509d8..e2d07a6319d50 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -795,9 +795,7 @@ int main(int argc, char ** argv) {
         }
 
         // deal with end of text token in interactive mode
-        auto last_token = llama_sampling_last(ctx_sampling);
-        if (last_token == llama_token_eos(model))
-        {
+        if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
             LOG("found EOS token\n");
 
             if (params.interactive) {
@@ -922,7 +920,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && (embd.back() == llama_token_eos(model)) && !(params.instruct || params.interactive || params.chatml)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
         }

From 1cd0a03720afdb5613d1f2a3cdeb6937add93e71 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Tue, 16 Apr 2024 20:34:21 +0800
Subject: [PATCH 6/6] fix

---
 llama.cpp | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 86ded5ba85d1b..440103ed34433 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4601,18 +4601,7 @@ static bool llm_load_tensors(
                 }
 
                 // output
-                {
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    if (model.arch != LLM_ARCH_MINICPM){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                            ml.n_created--; // artificial tensor
-                            ml.size_data += ggml_nbytes(model.output);
-                        }
-                    }
-                }
+                model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 
                 for (int i = 0; i < n_layer; ++i) {
                     ggml_context * ctx_layer = ctx_for_layer(i);
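
After PATCH 6/6, the MiniCPM path of llm_load_tensors always creates output_norm.weight, creates output.weight only when tie_lm_head is false (PATCH 2/6), and the lm_head matmul in the graph uses model.tok_embd for tied heads and model.output otherwise. One way to check which variant a converted file actually carries is the gguf-py reader shipped in this repository; the sketch below assumes a placeholder file name and that, as in the gguf-py reader of this period, the scalar value of a field sits in the field's last part:

    from gguf import GGUFReader  # gguf-py package shipped in this repository

    reader = GGUFReader("minicpm.gguf")  # placeholder path
    field = reader.fields.get("minicpm.tie_lm_head")
    if field is None:
        # Key absent: old-style conversion, loader default tie_lm_head = true.
        print("tie_lm_head not present, lm_head is tied to the token embedding")
    else:
        # Booleans are stored as a one-element array; index 0 holds the value.
        print("minicpm.tie_lm_head =", bool(field.parts[-1][0]))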