@@ -210,7 +210,7 @@ enum llm_arch {
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
-    LLM_ARCH_RWKV,
+    LLM_ARCH_RWKV6,
     LLM_ARCH_UNKNOWN,
 };

@@ -256,7 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5,        "t5"        },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
     { LLM_ARCH_JAIS,      "jais"      },
-    { LLM_ARCH_RWKV,      "rwkv"      },
+    { LLM_ARCH_RWKV6,     "rwkv6"     },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };

@@ -1328,7 +1328,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         },
     },
     {
-        LLM_ARCH_RWKV,
+        LLM_ARCH_RWKV6,
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@@ -3052,7 +3052,7 @@ static bool llama_kv_cache_init(
     cache.has_shift = false;

     // TODO: find a nicer way to add other recurrent model architectures
-    cache.recurrent = model.arch == LLM_ARCH_MAMBA || model.arch == LLM_ARCH_RWKV;
+    cache.recurrent = model.arch == LLM_ARCH_MAMBA || model.arch == LLM_ARCH_RWKV6;
     cache.v_trans = !cache.recurrent && !cparams.flash_attn;

     cache.head = 0;
@@ -5348,7 +5348,7 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_RWKV:
+        case LLM_ARCH_RWKV6:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
@@ -7700,7 +7700,7 @@ static bool llm_load_tensors(
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                     }
                 } break;
-            case LLM_ARCH_RWKV:
+            case LLM_ARCH_RWKV6:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -8555,7 +8555,7 @@ static struct ggml_tensor * llm_build_kv(
 }


-static struct ggml_tensor * llm_build_time_mix(
+static struct ggml_tensor * llm_build_time_mix_rwkv6(
         struct ggml_context * ctx,
         const struct llama_layer * layer,
         struct ggml_tensor * cur,
@@ -8716,7 +8716,7 @@ static struct ggml_tensor * llm_build_time_mix(
     return ggml_mul_mat(ctx, layer->time_mix_output, cur);
 }

-static struct ggml_tensor * llm_build_channel_mix(
+static struct ggml_tensor * llm_build_channel_mix_rwkv6(
         struct ggml_context * ctx,
         const struct llama_layer * layer,
         struct ggml_tensor * cur,
@@ -14134,7 +14134,7 @@ struct llm_build_context {
         return gf;
     }

-    ggml_cgraph * build_rwkv() {
+    ggml_cgraph * build_rwkv6() {
         ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

         // Token shift state dimensions should be 2 * n_emb
@@ -14182,7 +14182,7 @@ struct llm_build_context {
                 n_embd, n_tokens
             );

-            cur = ggml_add(ctx0, cur, llm_build_time_mix(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq));
+            cur = ggml_add(ctx0, cur, llm_build_time_mix_rwkv6(ctx0, layer, x_norm, x_prev, &wkv_states, state_seq));
             ggml_build_forward_expand(gf, cur);
             ggml_build_forward_expand(
                 gf,
@@ -14218,7 +14218,7 @@ struct llm_build_context {
                 ggml_view_1d(ctx0, tmp, n_embd * n_tokens, 0),
                 n_embd, n_tokens
             );
-            cur = ggml_add(ctx0, cur, llm_build_channel_mix(ctx0, layer, x_norm, x_prev));
+            cur = ggml_add(ctx0, cur, llm_build_channel_mix_rwkv6(ctx0, layer, x_norm, x_prev));
             ggml_build_forward_expand(gf, cur);
             ggml_build_forward_expand(
                 gf,
@@ -14523,9 +14523,9 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_jais();
             } break;
-        case LLM_ARCH_RWKV:
+        case LLM_ARCH_RWKV6:
             {
-                result = llm.build_rwkv();
+                result = llm.build_rwkv6();
             } break;
         default:
             GGML_ABORT("fatal error");
@@ -17250,7 +17250,7 @@ struct llama_context * llama_new_context_with_model(
     ggml_type type_v = params.type_v;

     // Mamba and RWKV only need a constant number of KV cache cells per sequence
-    if (model->arch == LLM_ARCH_MAMBA || model->arch == LLM_ARCH_RWKV) {
+    if (model->arch == LLM_ARCH_MAMBA || model->arch == LLM_ARCH_RWKV6) {
         // Mamba and RWKV need at least as many KV cells as there are sequences kept at any time
         kv_size = std::max((uint32_t) 1, params.n_seq_max);
         // it's probably best to keep as much precision as possible for the states
@@ -17560,7 +17560,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_T5:
         case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
-        case LLM_ARCH_RWKV:
+        case LLM_ARCH_RWKV6:
             return LLAMA_ROPE_TYPE_NONE;

         // use what we call a normal RoPE, operating on pairs of consecutive head values