
Commit 4446994

refactor: Split granite architectures out of llm_build_llama
This helps de-clutter the llama-family graph construction and allows granite to diverge further (in preparation for Granite 4).

NOTE: I removed the granite scale factors from llm_build_deci because they appear to only be there as copy-paste from llm_build_llama. The HF config does not seem to set those values: https://huggingface.co/Deci/DeciLM-7B/blob/main/config.json

Branch: GraniteMoEShared

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
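For context on the "granite scale factors" mentioned above, the sketch below shows, with plain scalars rather than ggml tensors, how the three Granite hyperparameters that appear in the diff (hparams.f_attention_scale, hparams.f_residual_scale, hparams.f_logit_scale) enter the graph: the attention scale replaces 1/sqrt(head_dim) as the KQ scale, the residual scale multiplies each block output before the residual add, and logits are divided by the logit scale after lm_head. This is a standalone illustration with made-up values, not llama.cpp code.

```cpp
// Standalone illustration only -- plain floats, hypothetical values, no ggml.
#include <cmath>
#include <cstdio>

struct granite_hparams {
    float f_attention_scale; // used as the KQ scale instead of 1/sqrt(head dim)
    float f_residual_scale;  // multiplies block output before each residual add
    float f_logit_scale;     // logits are divided by this after lm_head
    int   n_embd_head;       // attention head dimension
};

int main() {
    // Placeholder values for illustration; real values come from the model's hparams.
    granite_hparams hp = { 0.0f, 0.22f, 8.0f, 128 };

    // Same fallback as the kq_scale line in llm_build_granite:
    // if f_attention_scale is unset (0.0f), use 1/sqrt(n_embd_head).
    const float kq_scale = hp.f_attention_scale == 0.0f
        ? 1.0f / std::sqrt((float) hp.n_embd_head)
        : hp.f_attention_scale;

    // Residual stream: attention and FFN outputs are scaled before the add,
    // mirroring ggml_scale(ctx0, cur, hparams.f_residual_scale).
    float residual = 1.0f;   // stand-in for inpSA / ffn_inp
    float attn_out = 0.5f;   // stand-in for the attention block output
    residual += attn_out * hp.f_residual_scale;
    float ffn_out = 0.3f;    // stand-in for the FFN block output
    residual += ffn_out * hp.f_residual_scale;

    // lm_head: logits are scaled by 1/f_logit_scale,
    // mirroring ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale).
    const float logit = residual / hp.f_logit_scale;

    std::printf("kq_scale=%.6f residual=%.3f logit=%.4f\n", kq_scale, residual, logit);
    return 0;
}
```

Note that llm_build_llama previously guarded each of these with checks such as if (hparams.f_residual_scale), so non-Granite models skipped them; the new Granite-only builder in the diff below applies them unconditionally.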

src/llama-model.cpp

Lines changed: 196 additions & 51 deletions
@@ -4610,11 +4610,6 @@ struct llm_build_llama : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -4672,7 +4667,7 @@ struct llm_build_llama : public llm_graph_context {
                         LLM_NORM_RMS, il);
                 cb(cur, "ffn_norm", il);
 
-                ggml_tensor * moe_out = build_moe_ffn(cur,
+                cur = build_moe_ffn(cur,
                         model.layers[il].ffn_gate_inp,
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
@@ -4683,28 +4678,7 @@ struct llm_build_llama : public llm_graph_context {
                         false, 0.0,
                         LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         il);
-                cb(moe_out, "ffn_moe_out", il);
-
-                // For Granite MoE Shared
-                if (hparams.n_ff_shexp > 0) {
-                    ggml_tensor * ffn_shexp = build_ffn(cur,
-                            model.layers[il].ffn_up_shexp, NULL, NULL,
-                            model.layers[il].ffn_gate_shexp, NULL, NULL,
-                            model.layers[il].ffn_down_shexp, NULL, NULL,
-                            NULL,
-                            LLM_FFN_SILU, LLM_FFN_PAR, il);
-                    cb(ffn_shexp, "ffn_shexp", il);
-
-                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
-                    cb(cur, "ffn_out", il);
-                } else {
-                    cur = moe_out;
-                }
-            }
-
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+                cb(cur, "ffn_moe_out", il);
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
@@ -4729,11 +4703,6 @@ struct llm_build_llama : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -4844,11 +4813,6 @@ struct llm_build_deci : public llm_graph_context {
                 continue;
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             ggml_tensor * ffn_inp = cur;
             if (n_head > 0) {
@@ -4872,11 +4836,6 @@ struct llm_build_deci : public llm_graph_context {
                 cb(cur, "ffn_out", il);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -4899,11 +4858,6 @@ struct llm_build_deci : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -12242,6 +12196,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };
 
+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - used for rope if enabled
+        ggml_tensor * inp_pos;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
@@ -12949,9 +13091,6 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
-        case LLM_ARCH_GRANITE_MOE_SHARED:
            {
                llm = std::make_unique<llm_build_llama>(*this, params, gf);
            } break;
@@ -13182,6 +13321,12 @@ llm_graph_result_ptr llama_model::build_graph(
            {
                llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
            } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_SHARED:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
