
Commit a76fbcd

Achazwl authored and anrongqiqo@qiyuanlab.com committed
fix LLaVA side effect
1 parent 36bff51 commit a76fbcd

5 files changed, +21 -19 lines changed

convert-hf-to-gguf.py

Lines changed: 0 additions & 2 deletions
@@ -1638,8 +1638,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
-        if "tie_word_embeddings" in self.hparams:
-            self.gguf_writer.add_tie_lm_head(self.hparams["tie_word_embeddings"])

     def set_vocab(self):
         self._set_vocab_llama_hf()
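
For context, "tie_word_embeddings" is a plain boolean in the Hugging Face config.json; with the dedicated GGUF key gone, a converter that still needs to know whether a checkpoint ships a separate lm_head can read the flag defensively without writing anything to the GGUF. A minimal sketch (the config path and variable names are illustrative, not part of this commit):

    import json

    # read the checkpoint's config.json and check whether the LM head is tied
    with open("model_dir/config.json") as f:  # illustrative path
        hparams = json.load(f)

    tied = hparams.get("tie_word_embeddings", False)
    print("lm_head tied to token embeddings:", tied)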

examples/minicpmv/minicpm-surgery.py

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@
     d = json.load(f)
     d.pop("auto_map")
     d["tokenizer_class"] = "LlamaTokenizer"
-    d.pop("add_prefix_space")
+    if "add_prefix_space" in d:
+        d.pop("add_prefix_space")
 with open(f"{args.model}/MiniCPM/tokenizer_config.json", "w") as f:
     json.dump(d, f, indent=2)

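The membership check keeps the surgery script from crashing on tokenizer configs that never had the key; dict.pop with a default argument is an equivalent, slightly shorter form. A small sketch for comparison (the dictionary contents are made up):

    d = {"tokenizer_class": "LlamaTokenizer", "add_prefix_space": False}

    # pop() with a default never raises KeyError, so no "in" check is needed
    d.pop("add_prefix_space", None)
    d.pop("add_prefix_space", None)  # key already gone: a no-op, still no error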

gguf-py/gguf/constants.py

Lines changed: 0 additions & 2 deletions
@@ -44,7 +44,6 @@ class LLM:
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE      = "{arch}.pooling_type"
         LOGIT_SCALE       = "{arch}.logit_scale"
-        TIE_LM_HEAD       = "{arch}.tie_lm_head"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

@@ -901,7 +900,6 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_FEED_FORWARD_LENGTH   = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT    = Keys.LLM.TENSOR_DATA_LAYOUT
-KEY_TIE_LM_HEAD           = Keys.LLM.TIE_LM_HEAD

 # attention
 KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
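
The Keys.LLM entries above are format-string templates that get the architecture name substituted in when a key is written; this is how the removed constant would have been rendered for a concrete model. A tiny sketch (the architecture name is illustrative):

    # the constants above are templates such as "{arch}.logit_scale"
    TIE_LM_HEAD = "{arch}.tie_lm_head"  # the template this commit removes

    print(TIE_LM_HEAD.format(arch="minicpm"))  # -> "minicpm.tie_lm_head"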

gguf-py/gguf/gguf_writer.py

Lines changed: 0 additions & 3 deletions
@@ -350,9 +350,6 @@ def add_feed_forward_length(self, length: int) -> None:
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

-    def add_tie_lm_head(self, tie_lm_head: bool) -> None:
-        self.add_bool(Keys.LLM.TIE_LM_HEAD.format(arch=self.arch), tie_lm_head)
-
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

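Should a converter ever need to record such a flag again without restoring the dedicated helper, the generic add_bool path that the surrounding methods use works directly with a hand-formatted key. A hedged sketch (the writer instance, architecture string, and key name are assumptions, not part of this commit):

    # sketch: emit an architecture-scoped boolean KV through the generic path;
    # "writer" is assumed to be an existing gguf.GGUFWriter instance
    def add_tied_head_flag(writer, arch: str, tied: bool) -> None:
        writer.add_bool("{arch}.tie_lm_head".format(arch=arch), tied)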

llama.cpp

Lines changed: 19 additions & 11 deletions
@@ -285,7 +285,6 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
-    LLM_KV_TIE_LM_HEAD,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,

@@ -362,7 +361,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_POOLING_TYPE,      "%s.pooling_type"      },
     { LLM_KV_LOGIT_SCALE,       "%s.logit_scale"       },
-    { LLM_KV_TIE_LM_HEAD,       "%s.tie_lm_head"       },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },

@@ -1827,7 +1825,6 @@ struct llama_hparams {

     bool causal_attn = true;
     bool need_kq_pos = false;
-    bool tie_lm_head = true;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;

@@ -3314,6 +3311,7 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));

         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, ggml_get_name(tensor));

         return tensor;
     }

@@ -3382,6 +3380,8 @@ struct llama_model_loader {
         ggml_set_name(tensor, name.c_str());

         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, name.c_str());
+

         return tensor;
     }

@@ -3699,7 +3699,6 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-    ml.get_key(LLM_KV_TIE_LM_HEAD,       hparams.tie_lm_head,   false);

     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);

@@ -4711,8 +4710,12 @@ static bool llm_load_tensors(
             case LLM_ARCH_MINICPM:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    if (!hparams.tie_lm_head){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }

                     // output

@@ -4793,6 +4796,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrese GROK\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }

@@ -4922,6 +4926,7 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrese FALCON\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }

@@ -5127,6 +5132,7 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrese MPT\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }

@@ -5249,6 +5255,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrese QWEN2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }

@@ -5539,6 +5546,7 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
                     ml.n_created--; // artificial tensor
+                    printf("created tensor decrese GEMMA\n");
                     ml.size_data += ggml_nbytes(model.output);

                     const int64_t n_ff = hparams.n_ff;

@@ -5579,6 +5587,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrese STARCODER2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }

@@ -5635,6 +5644,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrese MAMBA\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }

@@ -5698,6 +5708,7 @@ static bool llm_load_tensors(
                         // init output from the input tok embed
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrese COMMAND_R\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }

@@ -5735,6 +5746,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrese OLMO\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }

@@ -9656,11 +9668,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);

         // lm_head
-        if (hparams.tie_lm_head){
-            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
-        } else {
-            cur = ggml_mul_mat(ctx0, model.output, cur);
-        }
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
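
With the hparams flag gone, whether MiniCPM reuses the token embeddings as its lm_head is now decided purely by the presence of an output.weight tensor in the GGUF file, matching the other architectures above. A rough way to check a converted file, assuming the gguf-py GGUFReader API and an illustrative path:

    from gguf import GGUFReader

    reader = GGUFReader("minicpm-model.gguf")  # illustrative path
    tensor_names = {t.name for t in reader.tensors}

    if "output.weight" in tensor_names:
        print("model ships a separate lm_head tensor")
    else:
        print("lm_head is tied: the loader falls back to token_embd.weight")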
