@@ -4610,11 +4610,6 @@ struct llm_build_llama : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -4672,7 +4667,7 @@ struct llm_build_llama : public llm_graph_context {
                         LLM_NORM_RMS, il);
                 cb(cur, "ffn_norm", il);

-                ggml_tensor * moe_out = build_moe_ffn(cur,
+                cur = build_moe_ffn(cur,
                         model.layers[il].ffn_gate_inp,
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
@@ -4683,28 +4678,7 @@ struct llm_build_llama : public llm_graph_context {
                         false, 0.0,
                         LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                         il);
-                cb(moe_out, "ffn_moe_out", il);
-
-                // For Granite MoE Shared
-                if (hparams.n_ff_shexp > 0) {
-                    ggml_tensor * ffn_shexp = build_ffn(cur,
-                            model.layers[il].ffn_up_shexp, NULL, NULL,
-                            model.layers[il].ffn_gate_shexp, NULL, NULL,
-                            model.layers[il].ffn_down_shexp, NULL, NULL,
-                            NULL,
-                            LLM_FFN_SILU, LLM_FFN_PAR, il);
-                    cb(ffn_shexp, "ffn_shexp", il);
-
-                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
-                    cb(cur, "ffn_out", il);
-                } else {
-                    cur = moe_out;
-                }
-            }
-
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+                cb(cur, "ffn_moe_out", il);
             }

             cur = ggml_add(ctx0, cur, ffn_inp);
@@ -4729,11 +4703,6 @@ struct llm_build_llama : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);

-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;

@@ -4844,11 +4813,6 @@ struct llm_build_deci : public llm_graph_context {
                 continue;
             }

-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             ggml_tensor * ffn_inp = cur;
             if (n_head > 0) {
@@ -4872,11 +4836,6 @@ struct llm_build_deci : public llm_graph_context {
                 cb(cur, "ffn_out", il);
             }

-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -4899,11 +4858,6 @@ struct llm_build_deci : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);

-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;

@@ -12242,6 +12196,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };

+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - used for rope if enabled
+        ggml_tensor * inp_pos;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
@@ -12949,9 +13091,6 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
-        case LLM_ARCH_GRANITE_MOE_SHARED:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13182,6 +13321,12 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_SHARED:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
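For orientation, the snippet below is a minimal standalone sketch (not part of the diff above) of the Granite-specific scaling that the new llm_build_granite applies unconditionally: each branch output is multiplied by f_residual_scale before the residual add, and the final logits are multiplied by 1 / f_logit_scale. It uses plain vectors instead of ggml tensors, and the hyperparameter values are made-up placeholders.

// Standalone sketch (assumption: illustrative only, not from this diff) of
// Granite-style residual and logit scaling on plain float vectors.
#include <cstdio>
#include <vector>

struct granite_hparams {
    float f_residual_scale = 0.22f; // placeholder value
    float f_logit_scale    = 16.0f; // placeholder value
};

// residual + scale * branch, mirroring:
//   cur = ggml_scale(ctx0, cur, f_residual_scale); ffn_inp = ggml_add(ctx0, cur, inpSA);
static std::vector<float> scaled_residual_add(const std::vector<float> & residual,
                                              const std::vector<float> & branch,
                                              float scale) {
    std::vector<float> out(residual.size());
    for (size_t i = 0; i < out.size(); ++i) {
        out[i] = residual[i] + scale * branch[i];
    }
    return out;
}

int main() {
    granite_hparams hp;

    std::vector<float> inpSA    = { 1.0f,  2.0f}; // residual stream entering the layer
    std::vector<float> attn_out = { 0.5f, -0.5f}; // attention branch output
    std::vector<float> ffn_out  = { 0.1f,  0.3f}; // FFN/MoE branch output

    // attention residual: ffn_inp = inpSA + f_residual_scale * attn_out
    std::vector<float> ffn_inp = scaled_residual_add(inpSA, attn_out, hp.f_residual_scale);

    // FFN residual: cur = ffn_inp + f_residual_scale * ffn_out
    std::vector<float> cur = scaled_residual_add(ffn_inp, ffn_out, hp.f_residual_scale);

    // logits: cur = lm_head(cur) * (1 / f_logit_scale)  (lm_head projection omitted here)
    for (float & x : cur) {
        x *= 1.0f / hp.f_logit_scale;
    }

    printf("scaled outputs: %f %f\n", cur[0], cur[1]);
    return 0;
}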