Commit 97de56d

fix: Fix the input to the shared experts
I had misread the architecture: the shared experts take the same input as the standard MoE layer (the hidden state _before_ the MoE), but I was feeding them the output of the MoE instead.

Branch: GraniteMoEShared
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent 97df181 commit 97de56d

src/llama-model.cpp

Lines changed: 5 additions & 3 deletions
@@ -4648,7 +4648,7 @@ struct llm_build_llama : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);
 
-            cur = build_moe_ffn(cur,
+            ggml_tensor * moe_out = build_moe_ffn(cur,
                     model.layers[il].ffn_gate_inp,
                     model.layers[il].ffn_up_exps,
                     model.layers[il].ffn_gate_exps,
@@ -4659,7 +4659,7 @@ struct llm_build_llama : public llm_graph_context {
                     false, 0.0,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                     il);
-            cb(cur, "ffn_moe_out", il);
+            cb(moe_out, "ffn_moe_out", il);
 
             // For Granite MoE Shared
             if (model.arch == LLM_ARCH_GRANITE_MOE_SHARED) {
@@ -4671,8 +4671,10 @@ struct llm_build_llama : public llm_graph_context {
                         LLM_FFN_SILU, LLM_FFN_PAR, il);
                 cb(ffn_shexp, "ffn_shexp", il);
 
-                cur = ggml_add(ctx0, cur, ffn_shexp);
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
                 cb(cur, "ffn_out", il);
+            } else {
+                cur = moe_out;
             }
         }
 
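For clarity, the dataflow the commit establishes is: both the routed MoE experts and the always-active shared experts read the same post-norm hidden state, and their outputs are summed. The standalone C++ sketch below illustrates that wiring with hypothetical stand-ins (routed_moe, shared_experts, plain float vectors); it is not llama.cpp/ggml code, only a minimal before/after picture of the computation under those assumptions.

// Minimal standalone sketch (hypothetical helpers, NOT the ggml graph API).
#include <cstdio>
#include <vector>

using Tensor = std::vector<float>;

// Stand-in for the routed mixture-of-experts FFN (build_moe_ffn in the real code).
static Tensor routed_moe(const Tensor & x) {
    Tensor out(x.size());
    for (size_t i = 0; i < x.size(); ++i) out[i] = 2.0f * x[i];  // dummy transform
    return out;
}

// Stand-in for the always-active shared experts (the ffn_*_shexp FFN).
static Tensor shared_experts(const Tensor & x) {
    Tensor out(x.size());
    for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] + 1.0f;  // dummy transform
    return out;
}

int main() {
    Tensor ffn_inp = {1.0f, -0.5f, 0.25f};  // post-norm hidden state entering the FFN block

    // Before the fix: the shared experts were fed the MoE output.
    Tensor moe_old = routed_moe(ffn_inp);
    Tensor shexp_old = shared_experts(moe_old);       // shexp(moe(x)) -- wrong input
    Tensor before(ffn_inp.size());
    for (size_t i = 0; i < before.size(); ++i) before[i] = moe_old[i] + shexp_old[i];

    // After the fix: both branches read the same input; their outputs are summed.
    Tensor moe_out = routed_moe(ffn_inp);
    Tensor shexp   = shared_experts(ffn_inp);         // shexp(x) -- same input as the MoE
    Tensor after(ffn_inp.size());
    for (size_t i = 0; i < after.size(); ++i) after[i] = moe_out[i] + shexp[i];

    for (size_t i = 0; i < after.size(); ++i) {
        std::printf("before fix: %6.2f   after fix: %6.2f\n", before[i], after[i]);
    }
    return 0;
}

In the graph code this corresponds to leaving the normalized FFN input in cur, capturing the routed-expert result in moe_out, building ffn_shexp from that same input, and only then adding the two, with cur falling back to moe_out when the architecture has no shared experts.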