
Commit d590cd4

model : Granite MoE shared (#13269)
* feat: Add GGUF conversion for granitemoeshared

* feat: hparam and arch plumbing for granitemoeshared

* fix: Split MoE fused tensors for shared experts in conversion

* feat: First WIP cut at model arch in cpp

  The hparam and architecture plumbing should be correct, but the implementation of the shared experts seems to still be broken.

* fix: Cleaner (maybe more correct?) splitting for gate/up

* fix: Fix the input to the shared experts

  I had misread that the shared experts take the inputs _before_ the standard MoE layer and was feeding the output of the MoE to the shared experts.

* fix: Avoid architecture-specific checks for Granite MoE Shared

  This is a cleaner way that will allow more flexibility in architecture strings going forward.

* refactor: Split granite architectures out of llm_build_llama

  This helps de-clutter the llama-family graph construction and allows granite to diverge further (in preparation for Granite 4).

  NOTE: I removed the granite scale factors from llm_build_deci because they appear to only be there as copy-paste from llm_build_llama. The HF config does not seem to set those values: https://huggingface.co/Deci/DeciLM-7B/blob/main/config.json

* fix: Fix compiler warning about uninitialized inp_pos

  This should not have been reachable, but it warns on some compilers.

* fix: Consolidate GraniteMoEShared into GraniteMoE for conversion

* fix: Consolidate GraniteMoEShared into GraniteMoE on the C++ side

Branch: GraniteMoEShared

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent 1e2809b commit d590cd4
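For readers following the "fix: Fix the input to the shared experts" item above, the intended data flow is that the shared expert consumes the same FFN input as the routed experts, and its output is summed with the routed-expert output. Below is a minimal PyTorch-style sketch of that flow; the module and field names are illustrative only (not the HF or llama.cpp implementation), and a SwiGLU-style shared MLP is assumed.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MoEWithSharedExpert(nn.Module):
    """Illustrative only: the routed experts and the shared expert see the SAME input x."""
    def __init__(self, d_model: int, shared_ffn_dim: int, routed_moe: nn.Module):
        super().__init__()
        self.routed_moe = routed_moe                                    # router + per-expert FFNs
        self.shared_gate = nn.Linear(d_model, shared_ffn_dim, bias=False)
        self.shared_up = nn.Linear(d_model, shared_ffn_dim, bias=False)
        self.shared_down = nn.Linear(shared_ffn_dim, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        routed = self.routed_moe(x)                                     # sparse, top-k routed path
        shared = self.shared_down(F.silu(self.shared_gate(x)) * self.shared_up(x))
        # The bug fixed in this commit was feeding `routed` (the MoE output)
        # into the shared expert instead of the original input `x`.
        return routed + shared

# Tiny usage example with a dummy routed path (nn.Identity stands in for a real MoE):
moe = MoEWithSharedExpert(d_model=64, shared_ffn_dim=128, routed_moe=nn.Identity())
y = moe(torch.randn(2, 5, 64))
```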

File tree

5 files changed: +235 −35 lines


convert_hf_to_gguf.py

Lines changed: 20 additions & 2 deletions
@@ -5746,11 +5746,20 @@ def set_gguf_parameters(self):
             logger.info("gguf: (granite) logits_scale = %s", logits_scale)
 
 
-@ModelBase.register("GraniteMoeForCausalLM")
+@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 class GraniteMoeModel(GraniteModel):
     """Conversion for IBM's GraniteMoeForCausalLM"""
     model_arch = gguf.MODEL_ARCH.GRANITE_MOE
 
+    def set_gguf_parameters(self):
+        """GraniteMoeShared uses GraniteMoe parameters plus the following:
+        - shared_intermediate_size
+        """
+        super().set_gguf_parameters()
+        if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
+            logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         """In modeling_granitemoe, the JetMoe implementation of parallel experts
         is used. This essentially merges w1 and w3 into a single tensor with 2x
@@ -5761,12 +5770,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.endswith("block_sparse_moe.input_linear.weight"):
             ffn_dim = self.hparams["intermediate_size"]
             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
-            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            gate, up = data_torch.split(ffn_dim, dim=-2)
             return [
                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
             ]
 
+        if name.endswith("shared_mlp.input_linear.weight"):
+            ffn_dim = self.hparams["shared_intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
+            gate, up = data_torch.split(ffn_dim, dim=-2)
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+            ]
+
         return super().modify_tensors(data_torch, name, bid)
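The splitting above relies on the JetMoe-style fused layout, where w1 (gate) and w3 (up) are stacked along the second-to-last dimension. A small standalone sketch with toy shapes, showing that `split(ffn_dim, dim=-2)` yields the same halves as the slicing it replaces:

```python
import torch

# Toy shapes: 4 experts, ffn_dim = 8, d_model = 16. The fused tensor stacks
# gate (w1) on top of up (w3) along dim -2, mirroring the merged input_linear weight.
ffn_dim, d_model = 8, 16
fused = torch.randn(4, 2 * ffn_dim, d_model)

gate_a, up_a = fused[..., :ffn_dim, :], fused[..., ffn_dim:, :]  # old explicit slicing
gate_b, up_b = fused.split(ffn_dim, dim=-2)                      # new split-based version

assert torch.equal(gate_a, gate_b) and torch.equal(up_a, up_b)   # identical halves
```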

gguf-py/gguf/constants.py

Lines changed: 3 additions & 0 deletions
@@ -1905,6 +1905,9 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
     ],
     MODEL_ARCH.CHAMELEON: [
         MODEL_TENSOR.TOKEN_EMBD,
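As a quick check on the new entries, gguf-py's constants also carry the tensor-name templates these enum values map to. A minimal sketch, assuming `gguf.TENSOR_NAMES` is importable from the installed gguf-py package:

```python
import gguf

# Print the on-disk GGUF names the new shared-expert enum entries resolve to
# for layer 0 (templates come from gguf-py's TENSOR_NAMES table).
for t in (gguf.MODEL_TENSOR.FFN_GATE_SHEXP,
          gguf.MODEL_TENSOR.FFN_UP_SHEXP,
          gguf.MODEL_TENSOR.FFN_DOWN_SHEXP):
    print(gguf.TENSOR_NAMES[t].format(bid=0))  # e.g. blk.0.ffn_gate_shexp
```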

gguf-py/gguf/tensor_mapping.py

Lines changed: 1 addition & 0 deletions
@@ -428,6 +428,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.down_proj",  # deepseek deepseek2
             "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj",  # llama4
+            "model.layers.{bid}.shared_mlp.output_linear",  # granitemoe
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
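One way to sanity-check this mapping is to resolve an HF tensor name through gguf-py's TensorNameMap. The sketch below assumes the `get_tensor_name_map` / `get_name` API from gguf-py; the layer index and block count are arbitrary examples.

```python
import gguf

# Build the name map for the Granite MoE architecture (32 blocks chosen arbitrarily).
name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.GRANITE_MOE, 32)

hf_name = "model.layers.3.shared_mlp.output_linear.weight"
print(name_map.get_name(hf_name, try_suffixes=(".weight", ".bias")))
# expected: blk.3.ffn_down_shexp.weight
```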

src/llama-arch.cpp

Lines changed: 3 additions & 0 deletions
@@ -1481,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
         },
     },
     {
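These C++ table entries have to match the names the converter writes. A hedged verification sketch, assuming gguf-py's GGUFReader API and a locally converted file at a hypothetical path, that lists the shared-expert tensors in a converted model:

```python
from gguf import GGUFReader

# "granitemoeshared.gguf" is a hypothetical output path from convert_hf_to_gguf.py.
reader = GGUFReader("granitemoeshared.gguf")

# Collect tensor names containing the shared-expert suffix registered above.
shexp = sorted(t.name for t in reader.tensors if "_shexp" in t.name)
print(shexp[:3])
# e.g. ['blk.0.ffn_down_shexp.weight', 'blk.0.ffn_gate_shexp.weight', 'blk.0.ffn_up_shexp.weight']
```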
