@@ -285,7 +285,6 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
-    LLM_KV_TIE_LM_HEAD,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -362,7 +361,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count"       },
     { LLM_KV_POOLING_TYPE,            "%s.pooling_type"            },
     { LLM_KV_LOGIT_SCALE,             "%s.logit_scale"             },
-    { LLM_KV_TIE_LM_HEAD,             "%s.tie_lm_head"             },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1827,7 +1825,6 @@ struct llama_hparams {
 
     bool causal_attn = true;
     bool need_kq_pos = false;
-    bool tie_lm_head = true;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -3314,6 +3311,7 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));
 
         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, ggml_get_name(tensor));
 
         return tensor;
     }
@@ -3382,6 +3380,8 @@ struct llama_model_loader {
         ggml_set_name(tensor, name.c_str());
 
         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, name.c_str());
+
 
         return tensor;
     }
@@ -3699,7 +3699,6 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-    ml.get_key(LLM_KV_TIE_LM_HEAD,       hparams.tie_lm_head,   false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -4711,8 +4710,12 @@ static bool llm_load_tensors(
             case LLM_ARCH_MINICPM:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    if (!hparams.tie_lm_head){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }
 
                     // output
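The MINICPM hunk above drops the `tie_lm_head` hparam in favor of the fallback pattern the other architectures in this file already use: try to load a dedicated `output.weight`, and if the checkpoint has none, alias the token-embedding tensor as the LM head. A minimal sketch of the pattern, assuming llama.cpp's loader semantics (passing `false` as the last `create_tensor` argument marks the tensor optional, so the call returns NULL instead of aborting the load):

    // Sketch only: ml, tn, ctx_output, ctx_output_split, n_embd, and n_vocab
    // are the names used by the surrounding llm_load_tensors code in this diff.
    struct ggml_tensor * output = ml.create_tensor(ctx_output_split,
            tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); // optional
    if (output == NULL) {
        // tied embeddings: reuse token_embd.weight as the LM head
        output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
        ml.n_created--;                      // artificial tensor: no new entry in the file
        ml.size_data += ggml_nbytes(output); // but its data is still loaded for this copy
    }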
@@ -4793,6 +4796,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease GROK\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -4922,6 +4926,7 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease FALCON\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5127,6 +5132,7 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease MPT\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5249,6 +5255,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease QWEN2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5539,6 +5546,7 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
                     ml.n_created--; // artificial tensor
+                    printf("created tensor decrease GEMMA\n");
                     ml.size_data += ggml_nbytes(model.output);
 
                     const int64_t n_ff = hparams.n_ff;
@@ -5579,6 +5587,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease STARCODER2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
 
@@ -5635,6 +5644,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease MAMBA\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5698,6 +5708,7 @@ static bool llm_load_tensors(
                         // init output from the input tok embed
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease COMMAND_R\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
 
@@ -5735,6 +5746,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease OLMO\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -9656,11 +9668,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        if (hparams.tie_lm_head){
-            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
-        } else {
-            cur = ggml_mul_mat(ctx0, model.output, cur);
-        }
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
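With the loader fallback in place, `model.output` is always non-NULL by the time the graph is built, so the `tie_lm_head` branch in `llm_build_context` collapses to a single unconditional projection. A sketch of what the call computes, assuming ggml's shape convention (for `ggml_mul_mat(ctx, a, b)` with `a` of shape `{n_embd, n_vocab}` and `b` of shape `{n_embd, n_tokens}`, the result has shape `{n_vocab, n_tokens}`):

    // lm_head: project hidden states to vocabulary logits.
    // model.output is either the real output.weight or the aliased tok_embd,
    // so the same call covers both the tied and untied cases.
    cur = ggml_mul_mat(ctx0, model.output, cur); // -> {n_vocab, n_tokens}
    cb(cur, "result_output", -1);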