@@ -5161,6 +5161,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_1B:            return "1B";
         case MODEL_1_3B:          return "1.3B";
         case MODEL_1_4B:          return "1.4B";
+        case MODEL_1_6B:          return "1.6B";
         case MODEL_2B:            return "2B";
         case MODEL_2_8B:          return "2.8B";
         case MODEL_3B:            return "3B";
@@ -15064,49 +15065,40 @@ struct llm_build_context {
         GGML_ASSERT(batch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);

-        ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;

         struct ggml_tensor * state_copy = build_inp_s_copy();
         struct ggml_tensor * state_mask = build_inp_s_mask();

-        ggml_tensor * cur = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);

-        for (int layer_i = 0; layer_i < n_layer; ++layer_i) {
-            const llama_layer * layer = &model.layers[layer_i];
+        for (int il = 0; il < n_layer; ++il) {
+            const llama_layer * layer = &model.layers[il];

             // (ab)using the KV cache to store the states
             struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
-                    gf, kv_self.k_l[layer_i], state_copy, state_mask,
+                    gf, kv_self.k_l[il], state_copy, state_mask,
                     hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
             struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
-                    gf, kv_self.v_l[layer_i], state_copy, state_mask,
+                    gf, kv_self.v_l[il], state_copy, state_mask,
                     hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);

-            cur = ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
+            cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+            token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);

-            token_shift = ggml_cont(
-                ctx0,
-                ggml_permute(
-                    ctx0,
-                    ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs),
-                    0, 2, 1, 3
-                )
-            );
+            struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+            struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));

-            struct ggml_tensor * att_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, 0);
-            struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, n_embd * n_seqs * ggml_element_size(token_shift));
-            att_shift = ggml_reshape_3d(ctx0, att_shift, n_embd, 1, n_seqs);
-            ffn_shift = ggml_reshape_3d(ctx0, ffn_shift, n_embd, 1, n_seqs);
-
-            struct ggml_tensor * x_norm = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i);
+            struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
             struct ggml_tensor * x_prev = ggml_concat(
                 ctx0,
                 att_shift,
-                ggml_view_3d(ctx0, x_norm, n_embd, n_seq_tokens - 1, n_seqs, x_norm->nb[1], x_norm->nb[2], 0),
+                ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
                 1
             );

-            cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm, x_prev, &wkv_states));
+            cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm_att, x_prev, &wkv_states));
             ggml_build_forward_expand(gf, cur);
             ggml_build_forward_expand(
                 gf,
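A note on the token_shift handling in the hunk above: the old code permuted the [n_embd, 2, n_seqs] shift state into [n_embd, n_seqs, 2] with a ggml_cont copy, took two 1D views, and reshaped them back to 3D. The new code keeps the tensor in its natural layout and takes two strided 3D views directly, so no data is moved. A minimal standalone sketch of that slicing (toy sizes, CPU only; assumes a ggml checkout contemporary with this commit):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t n_embd = 8, n_seqs = 4;

        // per-sequence shift state: row 0 = attention shift, row 1 = FFN shift
        struct ggml_tensor * token_shift = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 2, n_seqs);

        // strided views select one row per sequence without any permute/cont copy
        struct ggml_tensor * att_shift = ggml_view_3d(ctx, token_shift, n_embd, 1, n_seqs,
                token_shift->nb[1], token_shift->nb[2], 0);
        struct ggml_tensor * ffn_shift = ggml_view_3d(ctx, token_shift, n_embd, 1, n_seqs,
                token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));

        printf("att_shift %lld x %lld x %lld, ffn_shift %lld x %lld x %lld\n",
                (long long) att_shift->ne[0], (long long) att_shift->ne[1], (long long) att_shift->ne[2],
                (long long) ffn_shift->ne[0], (long long) ffn_shift->ne[1], (long long) ffn_shift->ne[2]);

        ggml_free(ctx);
        return 0;
    }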
@@ -15115,38 +15107,22 @@ struct llm_build_context {
                     wkv_states,
                     ggml_view_1d(
                         ctx0,
-                        kv_self.v_l[layer_i],
+                        kv_self.v_l[il],
                         hparams.n_embd_v_s() * n_seqs,
-                        hparams.n_embd_v_s() * kv_head * ggml_type_size(kv_self.v_l[layer_i]->type)
+                        hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
                     )
                 )
             );
-            struct ggml_tensor * last_norm = ggml_view_3d(ctx0, x_norm, n_embd, 1, n_seqs, x_norm->nb[1], x_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm));
-            ggml_build_forward_expand(
-                gf,
-                ggml_cpy(
-                    ctx0, last_norm,
-                    ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, 0)
-                )
-            );

-            x_norm = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i);
+            ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
             x_prev = ggml_concat(
                 ctx0,
                 ffn_shift,
-                ggml_view_3d(ctx0, x_norm, n_embd, n_seq_tokens - 1, n_seqs, x_norm->nb[1], x_norm->nb[2], 0),
+                ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
                 1
             );
-            cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm, x_prev));
-            last_norm = ggml_view_3d(ctx0, x_norm, n_embd, 1, n_seqs, x_norm->nb[1], x_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm));
+            cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm_ffn, x_prev));
             ggml_build_forward_expand(gf, cur);
-            ggml_build_forward_expand(
-                gf,
-                ggml_cpy(
-                    ctx0, last_norm,
-                    ggml_view_1d(ctx0, token_shift, n_embd * n_seqs, n_embd * n_seqs * ggml_element_size(token_shift))
-                )
-            );

             token_shift = ggml_cont(
                 ctx0,
@@ -15157,20 +15133,32 @@ struct llm_build_context {
                 )
             );

+            struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
+            struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn));
+
+            token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
+
             ggml_build_forward_expand(
                 gf,
                 ggml_cpy(
                     ctx0,
                     ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
-                    ggml_view_1d(ctx0, kv_self.k_l[layer_i], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_type_size(kv_self.k_l[layer_i]->type))
+                    ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
                 )
             );

-            if ((layer_i + 1) % hparams.rescale_every_n_layers == 0) {
+            if ((il + 1) % hparams.rescale_every_n_layers == 0) {
                 cur = ggml_scale(ctx0, cur, 0.5F);
             }
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
         }

+        cur = inpL;
         ggml_tensor * inp_out_ids = build_inp_out_ids();
         cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
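The consolidated writeback added above replaces the two per-sublayer copies deleted in the previous hunk: the last normalized token of every sequence is taken from each sublayer, the two slices are concatenated along dim 1 into one [n_embd, 2, n_seqs] shift tensor, and a single ggml_cpy stores it in the layer's K-cache line. A minimal standalone sketch of that pattern (toy sizes; `k_cache` is a stand-in for `kv_self.k_l[il]`, and the ggml API is assumed as of this commit):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t n_embd = 8, n_seq_tokens = 3, n_seqs = 2;

        struct ggml_tensor * x_norm_att = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs);
        struct ggml_tensor * x_norm_ffn = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs);
        struct ggml_tensor * k_cache    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd * 2 * n_seqs); // stand-in cache line
        ggml_set_f32(x_norm_att, 1.0f);
        ggml_set_f32(x_norm_ffn, 2.0f);

        // last token of every sequence, per sublayer
        struct ggml_tensor * last_att = ggml_view_3d(ctx, x_norm_att, n_embd, 1, n_seqs,
                x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens - 1)*n_embd*ggml_element_size(x_norm_att));
        struct ggml_tensor * last_ffn = ggml_view_3d(ctx, x_norm_ffn, n_embd, 1, n_seqs,
                x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens - 1)*n_embd*ggml_element_size(x_norm_ffn));

        // one [n_embd, 2, n_seqs] shift tensor, one copy into the cache
        struct ggml_tensor * token_shift = ggml_concat(ctx, last_att, last_ffn, 1);
        struct ggml_tensor * writeback   = ggml_cpy(ctx,
                ggml_view_1d(ctx, token_shift, n_embd * n_seqs * 2, 0),
                ggml_view_1d(ctx, k_cache,     n_embd * n_seqs * 2, 0));

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, writeback);
        ggml_graph_compute_with_ctx(ctx, gf, 1);

        ggml_free(ctx);
        return 0;
    }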