Commit 0ee167e

feat: First WIP cut at model arch in cpp
The hparam and architecture plumbing should be correct, but the implementation of the shared experts seems to still be broken.

Branch: GraniteMoEShared

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent 5a98306 commit 0ee167e
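Note on what the change wires up: a shared expert is a dense SwiGLU FFN of width n_ff_shexp that runs for every token alongside the routed experts, and its output is summed with the routed MoE output. Below is a minimal standalone sketch of that math, in plain C++ for illustration only; the shared_expert_ffn helper name and the row-major weight layout are assumptions, not the ggml-based graph code in this commit.

    #include <cmath>
    #include <vector>

    // SiLU activation used by the gate projection.
    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    // y = W_down * (silu(W_gate * x) * (W_up * x)) for a single token x of size n_embd.
    // W_gate and W_up are [n_ff_shexp x n_embd], W_down is [n_embd x n_ff_shexp], row-major.
    static std::vector<float> shared_expert_ffn(
            const std::vector<float> & x,
            const std::vector<float> & w_gate,
            const std::vector<float> & w_up,
            const std::vector<float> & w_down,
            int n_embd, int n_ff_shexp) {
        std::vector<float> h(n_ff_shexp);
        for (int i = 0; i < n_ff_shexp; ++i) {
            float g = 0.0f, u = 0.0f;
            for (int j = 0; j < n_embd; ++j) {
                g += w_gate[i*n_embd + j] * x[j];
                u += w_up  [i*n_embd + j] * x[j];
            }
            h[i] = silu(g) * u; // parallel gating, i.e. LLM_FFN_SILU + LLM_FFN_PAR
        }
        std::vector<float> y(n_embd, 0.0f);
        for (int j = 0; j < n_embd; ++j) {
            for (int i = 0; i < n_ff_shexp; ++i) {
                y[j] += w_down[j*n_ff_shexp + i] * h[i];
            }
        }
        return y; // added element-wise to the routed experts' output (ffn_moe_out)
    }

In the diff below, build_ffn with LLM_FFN_SILU/LLM_FFN_PAR constructs the same SwiGLU expression on the compute graph, and ggml_add sums it with ffn_moe_out.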

2 files changed: +36 −1 lines changed

src/llama-arch.cpp

Lines changed: 1 addition & 0 deletions
@@ -1500,6 +1500,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
         },

src/llama-model.cpp

Lines changed: 35 additions & 1 deletion
@@ -1366,6 +1366,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_SHARED:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -1379,6 +1380,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                if (arch == LLM_ARCH_GRANITE_MOE_SHARED) {
+                    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+                }
             } break;
         case LLM_ARCH_CHAMELEON:
             {
@@ -1701,6 +1707,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_SHARED:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -1753,6 +1760,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                         layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        // For Granite MoE Shared
+                        if (arch == LLM_ARCH_GRANITE_MOE_SHARED) {
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                        }
                     }
                 }
             } break;
@@ -4355,10 +4369,14 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
-    if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+    if (arch == LLM_ARCH_MINICPM ||
+        arch == LLM_ARCH_GRANITE ||
+        arch == LLM_ARCH_GRANITE_MOE ||
+        arch == LLM_ARCH_GRANITE_MOE_SHARED) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp        = %d\n", __func__, hparams.n_ff_shexp);
     }
 
     if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4642,6 +4660,20 @@ struct llm_build_llama : public llm_graph_context {
                         LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         il);
                 cb(cur, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (model.arch == LLM_ARCH_GRANITE_MOE_SHARED) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, cur, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
             }
 
             // For Granite architecture
@@ -12880,6 +12912,7 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_SHARED:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13257,6 +13290,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_SHARED:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
             return LLAMA_ROPE_TYPE_NORM;
