@@ -8563,36 +8563,29 @@ static struct ggml_tensor * llm_build_kv(
8563
8563
static struct ggml_tensor * llm_build_time_mix(
8564
8564
struct ggml_context * ctx,
8565
8565
const struct llama_layer * layer,
8566
- struct ggml_tensor * current ,
8566
+ struct ggml_tensor * cur ,
8567
8567
struct ggml_tensor * x_prev,
8568
8568
struct ggml_tensor ** wkv_state,
8569
8569
struct ggml_tensor * state_seq) {
8570
- size_t n_embed = current ->ne[0];
8571
- size_t n_tokens = current ->ne[1];
8570
+ size_t n_embed = cur ->ne[0];
8571
+ size_t n_tokens = cur ->ne[1];
8572
8572
size_t head_size = layer->time_mix_first->ne[0];
8573
8573
size_t head_count = layer->time_mix_first->ne[1];
8574
8574
size_t n_kv = state_seq->ne[0];
8575
8575
8576
- struct ggml_tensor * sx = ggml_sub(ctx, x_prev, current);
8577
- struct ggml_tensor * xxx = ggml_add_inplace(
8578
- ctx,
8579
- ggml_mul(ctx, sx, layer->time_mix_lerp_x),
8580
- current
8581
- );
8576
+ struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
8577
+ struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
8582
8578
8583
8579
xxx = ggml_reshape_4d(
8584
8580
ctx,
8585
- ggml_tanh_inplace (
8581
+ ggml_tanh (
8586
8582
ctx,
8587
8583
ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
8588
8584
),
8589
8585
layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
8590
8586
);
8591
8587
8592
- xxx = ggml_cont(
8593
- ctx,
8594
- ggml_permute(ctx, xxx, 0, 1, 3, 2)
8595
- );
8588
+ xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2));
8596
8589
8597
8590
xxx = ggml_mul_mat(
8598
8591
ctx,
@@ -8614,151 +8607,138 @@ static struct ggml_tensor * llm_build_time_mix(
8614
8607
struct ggml_tensor *mk = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens);
8615
8608
mk = ggml_reshape_2d(
8616
8609
ctx,
8617
- ggml_set_1d_inplace (ctx, mk, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * sizeof(float)), 0),
8610
+ ggml_set_1d (ctx, mk, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * sizeof(float)), 0),
8618
8611
n_embed, n_tokens
8619
8612
);
8620
8613
8621
8614
struct ggml_tensor *mv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens);
8622
8615
mv = ggml_reshape_2d(
8623
8616
ctx,
8624
- ggml_set_1d_inplace (ctx, mv, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 2 * sizeof(float)), 0),
8617
+ ggml_set_1d (ctx, mv, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 2 * sizeof(float)), 0),
8625
8618
n_embed, n_tokens
8626
8619
);
8627
8620
8628
8621
struct ggml_tensor *mr = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens);
8629
8622
mr = ggml_reshape_2d(
8630
8623
ctx,
8631
- ggml_set_1d_inplace (ctx, mr, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 3 * sizeof(float)), 0),
8624
+ ggml_set_1d (ctx, mr, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 3 * sizeof(float)), 0),
8632
8625
n_embed, n_tokens
8633
8626
);
8634
8627
8635
8628
struct ggml_tensor *mg = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embed * n_tokens);
8636
8629
mg = ggml_reshape_2d(
8637
8630
ctx,
8638
- ggml_set_1d_inplace (ctx, mg, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 4 * sizeof(float)), 0),
8631
+ ggml_set_1d (ctx, mg, ggml_view_1d(ctx, xxx, n_embed * n_tokens, n_embed * n_tokens * 4 * sizeof(float)), 0),
8639
8632
n_embed, n_tokens
8640
8633
);
8641
8634
8642
- struct ggml_tensor * xw = ggml_add_inplace (
8635
+ struct ggml_tensor * xw = ggml_add (
8643
8636
ctx,
8644
- ggml_mul_inplace (
8637
+ ggml_mul (
8645
8638
ctx,
8646
8639
ggml_add(ctx, mw, layer->time_mix_lerp_w),
8647
8640
sx
8648
8641
),
8649
- current
8642
+ cur
8650
8643
);
8651
8644
8652
- struct ggml_tensor * xk = ggml_add_inplace (
8645
+ struct ggml_tensor * xk = ggml_add (
8653
8646
ctx,
8654
- ggml_mul_inplace (
8647
+ ggml_mul (
8655
8648
ctx,
8656
8649
ggml_add(ctx, mk, layer->time_mix_lerp_k),
8657
8650
sx
8658
8651
),
8659
- current
8652
+ cur
8660
8653
);
8661
8654
8662
- struct ggml_tensor * xv = ggml_add_inplace (
8655
+ struct ggml_tensor * xv = ggml_add (
8663
8656
ctx,
8664
- ggml_mul_inplace (
8657
+ ggml_mul (
8665
8658
ctx,
8666
8659
ggml_add(ctx, mv, layer->time_mix_lerp_v),
8667
8660
sx
8668
8661
),
8669
- current
8662
+ cur
8670
8663
);
8671
8664
8672
- struct ggml_tensor * xr = ggml_add_inplace (
8665
+ struct ggml_tensor * xr = ggml_add (
8673
8666
ctx,
8674
- ggml_mul_inplace (
8667
+ ggml_mul (
8675
8668
ctx,
8676
8669
ggml_add(ctx, mr, layer->time_mix_lerp_r),
8677
8670
sx
8678
8671
),
8679
- current
8672
+ cur
8680
8673
);
8681
8674
8682
- struct ggml_tensor * xg = ggml_add_inplace (
8675
+ struct ggml_tensor * xg = ggml_add (
8683
8676
ctx,
8684
- ggml_mul_inplace (
8677
+ ggml_mul (
8685
8678
ctx,
8686
8679
ggml_add(ctx, mg, layer->time_mix_lerp_g),
8687
8680
sx
8688
8681
),
8689
- current
8682
+ cur
8690
8683
);
8691
8684
8692
8685
struct ggml_tensor * r = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
8693
8686
struct ggml_tensor * k = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
8694
8687
struct ggml_tensor * v = ggml_reshape_4d(ctx, ggml_mul_mat(ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
8695
- struct ggml_tensor * g = ggml_silu_inplace (
8688
+ struct ggml_tensor * g = ggml_silu (
8696
8689
ctx,
8697
8690
ggml_mul_mat(ctx, layer->time_mix_gate, xg)
8698
8691
);
8699
8692
8700
8693
struct ggml_tensor * w = ggml_mul_mat(
8701
8694
ctx,
8702
8695
layer->time_mix_decay_w2,
8703
- ggml_tanh_inplace (
8696
+ ggml_tanh (
8704
8697
ctx,
8705
8698
ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
8706
8699
)
8707
8700
);
8708
- w = ggml_add_inplace(
8709
- ctx,
8710
- w,
8711
- ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed)
8712
- );
8701
+ w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
8713
8702
w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
8714
8703
w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
8715
8704
8716
8705
k = ggml_transpose(ctx, k);
8717
8706
v = ggml_transpose(ctx, v);
8718
8707
r = ggml_transpose(ctx, r);
8719
8708
struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state, state_seq);
8720
- current = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
8709
+ cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
8721
8710
*wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_kv, n_embed * n_tokens * sizeof(float));
8722
8711
8723
8712
// ggml_group_norm considers groups in the third dimension.
8724
- current = ggml_reshape_4d(ctx, current , 1, 1, n_embed, n_tokens);
8725
- current = ggml_group_norm(ctx, current , head_count, 64e-5f);
8713
+ cur = ggml_reshape_4d(ctx, cur , 1, 1, n_embed, n_tokens);
8714
+ cur = ggml_group_norm(ctx, cur , head_count, 64e-5f);
8726
8715
// Convert back to a regular vector.
8727
- current = ggml_reshape_2d(ctx, current, n_embed, n_tokens);
8728
- current = ggml_add_inplace(
8729
- ctx,
8730
- ggml_mul_inplace(
8731
- ctx,
8732
- current,
8733
- layer->time_mix_ln
8734
- ),
8735
- layer->time_mix_ln_b
8736
- );
8716
+ cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
8717
+ cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
8737
8718
8738
- current = ggml_mul(ctx, current , g);
8719
+ cur = ggml_mul(ctx, cur , g);
8739
8720
8740
- return ggml_mul_mat(ctx, layer->time_mix_output, current );
8721
+ return ggml_mul_mat(ctx, layer->time_mix_output, cur );
8741
8722
}
8742
8723
8743
8724
// Builds the channel-mix sub-graph for one layer and returns its output tensor.
//
// Reading this diff view: rows starting with "-" are the pre-change code (parameter
// named `current`, *_inplace op variants); rows starting with "+" are the replacement
// (parameter named `cur`, non-inplace ops); bare numbers are old/new line numbers.
//
// Parameters:
//   ctx    - ggml context passed to every ggml_* call below.
//   layer  - supplies channel_mix_lerp_k, channel_mix_lerp_r, channel_mix_receptance,
//            channel_mix_key and channel_mix_value.
//   cur    - input tensor (named `current` before the change).
//   x_prev - tensor that `cur` is subtracted from to form the mixing delta `sx`.
//
// Post-change ("+") computation, as written:
//   sx = x_prev - cur
//   xk = sx * channel_mix_lerp_k + cur
//   xr = sx * channel_mix_lerp_r + cur
//   r  = sigmoid(mul_mat(channel_mix_receptance, xr))
//   k  = sqr(relu(mul_mat(channel_mix_key, xk)))
//   return r * mul_mat(channel_mix_value, k)
// The removed ("-") rows chain the same operations through the *_inplace variants.
static struct ggml_tensor * llm_build_channel_mix(
8744
8725
struct ggml_context * ctx,
8745
8726
const struct llama_layer * layer,
8746
- struct ggml_tensor * current ,
8727
+ struct ggml_tensor * cur ,
8747
8728
struct ggml_tensor * x_prev) {
8748
- struct ggml_tensor * sx = ggml_sub(ctx, x_prev, current);
8749
- struct ggml_tensor * xk = ggml_add_inplace(
8750
- ctx,
8751
- ggml_mul(ctx, sx, layer->channel_mix_lerp_k),
8752
- current
8753
- );
8754
- struct ggml_tensor * xr = ggml_add_inplace(
8729
+ struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
8730
+ struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
8731
+ struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
8732
+
8733
+ struct ggml_tensor * r = ggml_sigmoid(ctx, ggml_mul_mat(ctx, layer->channel_mix_receptance, xr));
8734
+ struct ggml_tensor * k = ggml_sqr(
8755
8735
// NOTE: the unprefixed `ctx,` below and the unprefixed `);` further down are shared
// context rows: they closed the removed `xr = ggml_add_inplace(` call and now belong
// to the added `k = ggml_sqr(` call.
ctx,
8756
- ggml_mul(ctx, sx, layer->channel_mix_lerp_r),
8757
- current
8736
+ ggml_relu(
8737
+ ctx,
8738
+ ggml_mul_mat(ctx, layer->channel_mix_key, xk)
8739
+ )
8758
8740
);
8759
- struct ggml_tensor * r = ggml_sigmoid_inplace(ctx, ggml_mul_mat(ctx, layer->channel_mix_receptance, xr));
8760
- struct ggml_tensor * k = ggml_sqr_inplace(ctx, ggml_relu_inplace(ctx, ggml_mul_mat(ctx, layer->channel_mix_key, xk)));
8761
- return ggml_mul_inplace(ctx, r, ggml_mul_mat(ctx, layer->channel_mix_value, k));
8741
+ return ggml_mul(ctx, r, ggml_mul_mat(ctx, layer->channel_mix_value, k));
8762
8742
}
8763
8743
8764
8744
struct llm_build_context {
@@ -14165,13 +14145,12 @@ struct llm_build_context {
14165
14145
// Token shift state dimensions should be 2 * n_emb
14166
14146
GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
14167
14147
14168
- // Input embeddings, start of the model after tokenizing ({n_embd, n_tokens})
14169
14148
ggml_tensor * input_embeddings = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
14170
14149
14171
14150
struct ggml_tensor * state_mask = build_inp_s_mask();
14172
14151
struct ggml_tensor * state_seq = build_inp_s_seq();
14173
14152
14174
- ggml_tensor * x = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
14153
+ ggml_tensor * cur = llm_build_norm(ctx0, input_embeddings, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
14175
14154
14176
14155
for (int layer_i = 0; layer_i < n_layer; ++layer_i) {
14177
14156
const llama_layer * layer = &model.layers[layer_i];
@@ -14200,16 +14179,16 @@ struct llm_build_context {
14200
14179
struct ggml_tensor * att_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_kv, 0);
14201
14180
struct ggml_tensor * ffn_shift = ggml_view_1d(ctx0, token_shift, n_embd * n_kv, n_embd * n_kv * ggml_element_size(kv_self.k_l[layer_i]));
14202
14181
14203
- struct ggml_tensor * x_norm = llm_build_norm(ctx0, x , hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i);
14182
+ struct ggml_tensor * x_norm = llm_build_norm(ctx0, cur , hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, layer_i);
14204
14183
struct ggml_tensor * tmp = ggml_rwkv_token_shift(ctx0, att_shift, x_norm, state_seq);
14205
14184
struct ggml_tensor * x_prev = ggml_reshape_2d(
14206
14185
ctx0,
14207
14186
ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0),
14208
14187
n_embd, n_tokens
14209
14188
);
14210
14189
14211
- x = ggml_add(ctx0, x , llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq));
14212
- ggml_build_forward_expand(gf, x );
14190
+ cur = ggml_add(ctx0, cur , llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq));
14191
+ ggml_build_forward_expand(gf, cur );
14213
14192
ggml_build_forward_expand(
14214
14193
gf,
14215
14194
ggml_cpy(
@@ -14237,15 +14216,15 @@ struct llm_build_context {
14237
14216
)
14238
14217
);
14239
14218
14240
- x_norm = llm_build_norm(ctx0, x , hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i);
14219
+ x_norm = llm_build_norm(ctx0, cur , hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, layer_i);
14241
14220
tmp = ggml_rwkv_token_shift(ctx0, ffn_shift, x_norm, state_seq);
14242
14221
x_prev = ggml_reshape_2d(
14243
14222
ctx0,
14244
14223
ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0),
14245
14224
n_embd, n_tokens
14246
14225
);
14247
- x = ggml_add(ctx0, x , llm_build_channel_mix(ctx0, layer, x_norm, x_prev));
14248
- ggml_build_forward_expand(gf, x );
14226
+ cur = ggml_add(ctx0, cur , llm_build_channel_mix(ctx0, layer, x_norm, x_prev));
14227
+ ggml_build_forward_expand(gf, cur );
14249
14228
ggml_build_forward_expand(
14250
14229
gf,
14251
14230
ggml_cpy(
@@ -14279,21 +14258,18 @@ struct llm_build_context {
14279
14258
);
14280
14259
14281
14260
if ((layer_i + 1) % hparams.rescale_every_n_layers == 0) {
14282
- x = ggml_scale(ctx0, x , 0.5F);
14261
+ cur = ggml_scale(ctx0, cur , 0.5F);
14283
14262
}
14284
14263
}
14285
14264
14286
- // Something related to skipping tokens, specifics unclear
14287
14265
ggml_tensor * inp_out_ids = build_inp_out_ids();
14288
- x = ggml_get_rows(ctx0, x , inp_out_ids);
14266
+ cur = ggml_get_rows(ctx0, cur , inp_out_ids);
14289
14267
14290
- // Output head, convert result vector to logits
14291
- x = llm_build_norm(ctx0, x, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
14292
- x = ggml_mul_mat(ctx0, model.output, x);
14268
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
14269
+ cur = ggml_mul_mat(ctx0, model.output, cur);
14293
14270
14294
- // Mark the output as being the result
14295
- cb(x, "result_output", -1);
14296
- ggml_build_forward_expand(gf, x);
14271
+ cb(cur, "result_output", -1);
14272
+ ggml_build_forward_expand(gf, cur);
14297
14273
14298
14274
return gf;
14299
14275
}
0 commit comments