From e913ac9c382b6fecca3287d573cb269d9a627482 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Sat, 30 Mar 2024 10:34:40 +0800
Subject: [PATCH 1/6] for new minicpm

---
 convert-hf-to-gguf.py     | 1 +
 examples/main/main.cpp    | 6 ++++--
 gguf-py/gguf/constants.py | 1 +
 llama.cpp                 | 3 ++-
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c5d2d0b7813d1..78a2b1c67f88f 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1559,6 +1559,7 @@ def set_vocab(self):
             self.gguf_writer.add_add_space_prefix(add_prefix)
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        print(special_vocab)
         old_eos = special_vocab.special_token_ids["eos"]
         if "chat" in os.path.basename(self.dir_model.absolute()):
             # For the chat model, we replace the eos with '<|im_end|>'.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e2d07a6319d50..2a19817d97990 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -795,7 +795,9 @@ int main(int argc, char ** argv) {
         }
 
         // deal with end of text token in interactive mode
-        if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+        auto last_token = llama_sampling_last(ctx_sampling);
+        if (last_token == llama_token_eos(model) || last_token == 122753)
+        {
             LOG("found EOS token\n");
 
             if (params.interactive) {
@@ -920,7 +922,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
+        if (!embd.empty() && (embd.back() == llama_token_eos(model) || embd.back() == 122753) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
         }
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 4ab026482a19e..ee26224d40b3d 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -548,6 +548,7 @@ class MODEL_TENSOR(IntEnum):
     ],
     MODEL_ARCH.MINICPM: [
         MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_NORM,
diff --git a/llama.cpp b/llama.cpp
index 892d46fbcfcec..8363233fa94da 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4375,6 +4375,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_MINICPM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
 
                 // output
                 {
@@ -8699,7 +8700,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);

From 9ecc666b94ca377ea4c4ce7706ca87239eab3d77 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Mon, 1 Apr 2024 15:07:42 +0800
Subject: [PATCH 2/6] compatible with old and new minicpm versions

---
 convert-hf-to-gguf.py       |  1 +
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  3 +++
 llama.cpp                   | 15 +++++++++++++--
 4 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 78a2b1c67f88f..d00481d02ec86 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1097,6 +1097,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_tie_lm_head(self.hparams["tie_lm_head"])
 
     def set_vocab(self):
         self._set_vocab_hf()
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ee26224d40b3d..9007c8addb468 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -43,6 +43,7 @@ class LLM:
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE      = "{arch}.pooling_type"
         LOGIT_SCALE       = "{arch}.logit_scale"
+        TIE_LM_HEAD       = "{arch}.tie_lm_head"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -805,6 +806,7 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_FEED_FORWARD_LENGTH   = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT    = Keys.LLM.TENSOR_DATA_LAYOUT
+KEY_TIE_LM_HEAD           = Keys.LLM.TIE_LM_HEAD
 
 # attention
 KEY_ATTENTION_HEAD_COUNT  = Keys.Attention.HEAD_COUNT
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 2ae6c814b52de..10465ab5e3109 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -342,6 +342,9 @@ def add_feed_forward_length(self, length: int) -> None:
 
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_tie_lm_head(self, tie_lm_head: bool) -> None:
+        self.add_bool(Keys.LLM.TIE_LM_HEAD.format(arch=self.arch), tie_lm_head)
 
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
diff --git a/llama.cpp b/llama.cpp
index 8363233fa94da..7337fc02c9d95 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -276,6 +276,7 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
+    LLM_KV_TIE_LM_HEAD,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -320,6 +321,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -345,6 +347,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_POOLING_TYPE ,     "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE,       "%s.logit_scale" },
+    { LLM_KV_TIE_LM_HEAD,       "%s.tie_lm_head" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1707,6 +1710,7 @@ struct llama_hparams {
 
     bool causal_attn = true;
     bool need_kq_pos = false;
+    bool tie_lm_head = true;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -3503,6 +3507,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_TIE_LM_HEAD, hparams.tie_lm_head, false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -4375,7 +4380,9 @@ static bool llm_load_tensors(
         case LLM_ARCH_MINICPM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                if (!hparams.tie_lm_head){
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                }
 
                 // output
                 {
@@ -8700,7 +8707,11 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+        if (hparams.tie_lm_head){
+            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        }else{
+            cur = ggml_mul_mat(ctx0, model.output, cur);
+        }
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);

From 8502a015a59b651ebe27b2d95376e27138ecaf2c Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Mon, 1 Apr 2024 17:00:07 +0800
Subject: [PATCH 3/6] remove eos

---
 examples/main/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 2a19817d97990..806af272509d8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -796,7 +796,7 @@ int main(int argc, char ** argv) {
 
         // deal with end of text token in interactive mode
         auto last_token = llama_sampling_last(ctx_sampling);
-        if (last_token == llama_token_eos(model) || last_token == 122753)
+        if (last_token == llama_token_eos(model))
         {
             LOG("found EOS token\n");
 
@@ -922,7 +922,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && (embd.back() == llama_token_eos(model) || embd.back() == 122753) && !(params.instruct || params.interactive || params.chatml)) {
+        if (!embd.empty() && (embd.back() == llama_token_eos(model)) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
         }

From 582b13c96669e899bfd18f87ced8c9d1d3194670 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Thu, 11 Apr 2024 15:35:18 +0800
Subject: [PATCH 4/6] for old config

---
 convert-hf-to-gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index d00481d02ec86..861dfd6437005 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1097,7 +1097,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_tie_lm_head(self.hparams["tie_lm_head"])
+        if "tie_lm_head" in self.hparams:
+            self.gguf_writer.add_tie_lm_head(self.hparams["tie_lm_head"])
 
     def set_vocab(self):
         self._set_vocab_hf()
@@ -1560,7 +1561,6 @@ def set_vocab(self):
             self.gguf_writer.add_add_space_prefix(add_prefix)
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        print(special_vocab)
         old_eos = special_vocab.special_token_ids["eos"]
         if "chat" in os.path.basename(self.dir_model.absolute()):
             # For the chat model, we replace the eos with '<|im_end|>'.
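
Note: the combined effect of PATCH 2/6 and PATCH 4/6 on the converter is that the GGUF key "{arch}.tie_lm_head" (so "minicpm.tie_lm_head" for this architecture) is written only when the Hugging Face config.json actually contains a "tie_lm_head" field. Older MiniCPM configs omit the field, the key is skipped, and the loader falls back to its default of tie_lm_head = true, meaning the lm_head reuses the token embedding. Below is a minimal stand-alone sketch of that decision; the helper name and the direct config.json read are assumptions for illustration, not code from the patch:

    import json
    from pathlib import Path

    def minicpm_tie_lm_head(model_dir: str) -> bool | None:
        # Mirrors the converter behaviour: report the flag only when the config
        # declares it; None means "do not write the key", in which case the
        # C++ loader keeps its default tie_lm_head = true (tied lm_head).
        hparams = json.loads((Path(model_dir) / "config.json").read_text())
        if "tie_lm_head" in hparams:
            return bool(hparams["tie_lm_head"])
        return None

    # Usage: only call gguf_writer.add_tie_lm_head(...) when the result is not None.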
From 4f61b3066e6ddd194008901820c1d2d1111534f4 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Thu, 11 Apr 2024 15:48:59 +0800
Subject: [PATCH 5/6] recover main.cpp

---
 examples/main/main.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 806af272509d8..e2d07a6319d50 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -795,9 +795,7 @@ int main(int argc, char ** argv) {
         }
 
         // deal with end of text token in interactive mode
-        auto last_token = llama_sampling_last(ctx_sampling);
-        if (last_token == llama_token_eos(model))
-        {
+        if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
             LOG("found EOS token\n");
 
             if (params.interactive) {
@@ -922,7 +920,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && (embd.back() == llama_token_eos(model)) && !(params.instruct || params.interactive || params.chatml)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;
         }

From 1cd0a03720afdb5613d1f2a3cdeb6937add93e71 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Tue, 16 Apr 2024 20:34:21 +0800
Subject: [PATCH 6/6] fix

---
 llama.cpp | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 86ded5ba85d1b..440103ed34433 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4601,18 +4601,7 @@ static bool llm_load_tensors(
                 }
 
                 // output
-                {
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    if (model.arch != LLM_ARCH_MINICPM){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
-                        // if output is NULL, init from the input tok embed
-                        if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                            ml.n_created--; // artificial tensor
-                            ml.size_data += ggml_nbytes(model.output);
-                        }
-                    }
-                }
+                model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 
                 for (int i = 0; i < n_layer; ++i) {
                     ggml_context * ctx_layer = ctx_for_layer(i);
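
After PATCH 6/6, the MiniCPM path of llm_load_tensors always creates output_norm.weight, creates output.weight only when tie_lm_head is false (PATCH 2/6), and the lm_head matmul in the graph uses model.tok_embd for tied heads and model.output otherwise. One way to check which variant a converted file actually carries is the gguf-py reader shipped in this repository; the sketch below assumes a placeholder file name and that, as in the gguf-py reader of this period, the scalar value of a field sits in the field's last part:

    from gguf import GGUFReader  # gguf-py package shipped in this repository

    reader = GGUFReader("minicpm.gguf")  # placeholder path
    field = reader.fields.get("minicpm.tie_lm_head")
    if field is None:
        # Key absent: old-style conversion, loader default tie_lm_head = true.
        print("tie_lm_head not present, lm_head is tied to the token embedding")
    else:
        # Booleans are stored as a one-element array; index 0 holds the value.
        print("minicpm.tie_lm_head =", bool(field.parts[-1][0]))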