@@ -285,7 +285,6 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
-    LLM_KV_TIE_LM_HEAD,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -362,7 +361,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count"       },
     { LLM_KV_POOLING_TYPE,            "%s.pooling_type"            },
     { LLM_KV_LOGIT_SCALE,             "%s.logit_scale"             },
-    { LLM_KV_TIE_LM_HEAD,             "%s.tie_lm_head"             },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1827,7 +1825,6 @@ struct llama_hparams {
 
     bool causal_attn = true;
     bool need_kq_pos = false;
-    bool tie_lm_head = true;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -3314,6 +3311,7 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));
 
         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, ggml_get_name(tensor));
 
         return tensor;
     }
@@ -3382,6 +3380,8 @@ struct llama_model_loader {
         ggml_set_name(tensor, name.c_str());
 
         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, name.c_str());
+
 
         return tensor;
     }
@@ -3699,7 +3699,6 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-    ml.get_key(LLM_KV_TIE_LM_HEAD,       hparams.tie_lm_head,   false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -4711,8 +4710,12 @@ static bool llm_load_tensors(
             case LLM_ARCH_MINICPM:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    if (!hparams.tie_lm_head){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }
 
                     // output
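The MINICPM hunk above drops the `tie_lm_head` hparam in favor of the fallback pattern the other architectures in this file already use: try to load a dedicated `output.weight`, and if the checkpoint has none, alias the token-embedding tensor as the LM head. A minimal sketch of the pattern, assuming llama.cpp's loader semantics (passing `false` as the last `create_tensor` argument marks the tensor optional, so the call returns NULL instead of aborting the load):

    // Sketch only: ml, tn, ctx_output, ctx_output_split, n_embd, and n_vocab
    // are the names used by the surrounding llm_load_tensors code in this diff.
    struct ggml_tensor * output = ml.create_tensor(ctx_output_split,
            tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); // optional
    if (output == NULL) {
        // tied embeddings: reuse token_embd.weight as the LM head
        output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
        ml.n_created--;                      // artificial tensor: no new entry in the file
        ml.size_data += ggml_nbytes(output); // but its data is still loaded for this copy
    }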
@@ -4793,6 +4796,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease GROK\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -4922,6 +4926,7 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease FALCON\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5127,6 +5132,7 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease MPT\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5249,6 +5255,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease QWEN2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5539,6 +5546,7 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
                     ml.n_created--; // artificial tensor
+                    printf("created tensor decrease GEMMA\n");
                     ml.size_data += ggml_nbytes(model.output);
 
                     const int64_t n_ff = hparams.n_ff;
@@ -5579,6 +5587,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease STARCODER2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
 
@@ -5635,6 +5644,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease MAMBA\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5698,6 +5708,7 @@ static bool llm_load_tensors(
                         // init output from the input tok embed
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease COMMAND_R\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
 
@@ -5735,6 +5746,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease OLMO\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -9656,11 +9668,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        if (hparams.tie_lm_head){
-            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
-        } else {
-            cur = ggml_mul_mat(ctx0, model.output, cur);
-        }
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
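With the loader fallback in place, `model.output` is always non-NULL by the time the graph is built, so the `tie_lm_head` branch in `llm_build_context` collapses to a single unconditional projection. A sketch of what the call computes, assuming ggml's shape convention (for `ggml_mul_mat(ctx, a, b)` with `a` of shape `{n_embd, n_vocab}` and `b` of shape `{n_embd, n_tokens}`, the result has shape `{n_vocab, n_tokens}`):

    // lm_head: project hidden states to vocabulary logits.
    // model.output is either the real output.weight or the aliased tok_embd,
    // so the same call covers both the tied and untied cases.
    cur = ggml_mul_mat(ctx0, model.output, cur); // -> {n_vocab, n_tokens}
    cb(cur, "result_output", -1);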