@@ -1366,6 +1366,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_GRANITE:
             case LLM_ARCH_GRANITE_MOE:
+            case LLM_ARCH_GRANITE_MOE_SHARED:
                 {
                     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                     ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -1379,6 +1380,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                         // Add additional layer/vocab/etc checks here for other model sizes
                         default: type = LLM_TYPE_UNKNOWN;
                     }
+
+                    // For Granite MoE Shared
+                    if (arch == LLM_ARCH_GRANITE_MOE_SHARED) {
+                        ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+                    }
                 } break;
             case LLM_ARCH_CHAMELEON:
                 {
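Note on the hparams hunk: LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH maps to per-architecture GGUF metadata of the form "<arch>.expert_shared_feed_forward_length", and the value lands in hparams.n_ff_shexp, the intermediate width of the shared expert. As a minimal standalone sketch (not part of this diff), the same value could be read straight from a GGUF file with ggml's gguf C API; the "granitemoeshared" architecture string below is an assumption, not something this change defines.

    // Sketch only: read n_ff_shexp directly from a GGUF file via ggml's gguf C API (gguf.h).
    // The key prefix "granitemoeshared" is an assumed architecture name; substitute whatever
    // string LLM_ARCH_GRANITE_MOE_SHARED actually maps to.
    #include <cstdio>
    #include "gguf.h"

    int main(int argc, char ** argv) {
        if (argc < 2) { std::fprintf(stderr, "usage: %s model.gguf\n", argv[0]); return 1; }

        gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (!ctx) { std::fprintf(stderr, "failed to read %s\n", argv[1]); return 1; }

        const char * key = "granitemoeshared.expert_shared_feed_forward_length"; // assumed key name
        const auto kid = gguf_find_key(ctx, key); // returns -1 if the key is absent
        if (kid >= 0) {
            std::printf("n_ff_shexp = %u\n", gguf_get_val_u32(ctx, kid));
        } else {
            std::printf("key not found: %s\n", key);
        }

        gguf_free(ctx);
        return 0;
    }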
@@ -1701,6 +1707,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_GRANITE:
             case LLM_ARCH_GRANITE_MOE:
+            case LLM_ARCH_GRANITE_MOE_SHARED:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -1753,6 +1760,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
                         layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        // For Granite MoE Shared
+                        if (arch == LLM_ARCH_GRANITE_MOE_SHARED) {
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                        }
                     }
                 }
             } break;
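The shapes above follow ggml's convention that a weight's first dimension matches the input width: gate and up project from n_embd to n_ff_shexp, and down projects back to n_embd. A small dimension-bookkeeping sketch (plain C++, illustrative only, not part of the change) states the invariants the three shared-expert tensors have to satisfy:

    // Dimension bookkeeping only: the shapes requested by the create_tensor() calls above,
    // with the constraints a gated shared expert must satisfy.
    #include <cassert>
    #include <cstdint>

    static void check_granite_shexp_dims(int64_t n_embd, int64_t n_ff_shexp) {
        const int64_t gate_shexp[2] = { n_embd,     n_ff_shexp }; // input n_embd      -> gate activations
        const int64_t up_shexp  [2] = { n_embd,     n_ff_shexp }; // input n_embd      -> up projections
        const int64_t down_shexp[2] = { n_ff_shexp, n_embd     }; // gated n_ff_shexp  -> back to n_embd

        assert(gate_shexp[1] == up_shexp[1]);   // gate and up are combined element-wise
        assert(up_shexp[1]   == down_shexp[0]); // down consumes the gated intermediate
        assert(down_shexp[1] == n_embd);        // result re-enters the residual stream
    }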
@@ -4355,10 +4369,14 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }

-    if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+    if (arch == LLM_ARCH_MINICPM ||
+        arch == LLM_ARCH_GRANITE ||
+        arch == LLM_ARCH_GRANITE_MOE ||
+        arch == LLM_ARCH_GRANITE_MOE_SHARED) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp        = %d\n", __func__, hparams.n_ff_shexp);
     }

     if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4642,6 +4660,20 @@ struct llm_build_llama : public llm_graph_context {
                         LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         il);
                 cb(cur, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (model.arch == LLM_ARCH_GRANITE_MOE_SHARED) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, cur, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
             }

             // For Granite architecture
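For reference, build_ffn with LLM_FFN_SILU and LLM_FFN_PAR computes a SiLU-gated parallel feed-forward, down(SiLU(gate(x)) * up(x)), and the hunk adds that shared-expert output onto the routed MoE result. In the hunk as written both the FFN input and the tensor being added to are cur; the standalone sketch below (plain C++ with row-major matrices, illustrative only, not the ggml implementation) keeps them as separate parameters for clarity.

    // Sketch of the shared-expert branch above: a SiLU-gated parallel FFN whose output
    // is added to the routed MoE output. Names mirror ffn_gate/up/down_shexp.
    #include <cmath>
    #include <vector>

    // y = W * x, with W stored row-major as [rows x cols]
    static std::vector<float> matvec(const std::vector<float> & w,
                                     const std::vector<float> & x, int rows, int cols) {
        std::vector<float> y(rows, 0.0f);
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                y[r] += w[r*cols + c] * x[c];
            }
        }
        return y;
    }

    // Returns moe_out + W_down(SiLU(W_gate x) * (W_up x))
    static std::vector<float> add_shared_expert(
            const std::vector<float> & moe_out,      // routed-experts output, size n_embd
            const std::vector<float> & x,            // shared-expert input, size n_embd
            const std::vector<float> & w_gate_shexp, // [n_ff_shexp x n_embd]
            const std::vector<float> & w_up_shexp,   // [n_ff_shexp x n_embd]
            const std::vector<float> & w_down_shexp, // [n_embd x n_ff_shexp]
            int n_embd, int n_ff_shexp) {
        std::vector<float> g = matvec(w_gate_shexp, x, n_ff_shexp, n_embd);
        std::vector<float> u = matvec(w_up_shexp,   x, n_ff_shexp, n_embd);
        for (int i = 0; i < n_ff_shexp; ++i) {
            const float silu = g[i] / (1.0f + std::exp(-g[i])); // SiLU(z) = z * sigmoid(z)
            g[i] = silu * u[i];                                 // parallel (gated) combine
        }
        std::vector<float> out = matvec(w_down_shexp, g, n_embd, n_ff_shexp);
        for (int i = 0; i < n_embd; ++i) {
            out[i] += moe_out[i];                               // ggml_add(cur, ffn_shexp) equivalent
        }
        return out;
    }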
@@ -12880,6 +12912,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_SHARED:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13257,6 +13290,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_SHARED:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_BAILINGMOE:
            return LLAMA_ROPE_TYPE_NORM;