
Commit a8db247 (1 parent: ccf392a)

llama: rwkv6: Add kv time_mix_extra_dim and time_decay_extra_dim

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

4 files changed: 24 additions, 2 deletions

convert_hf_to_gguf.py (4 additions, 0 deletions)

@@ -2754,6 +2754,8 @@ def set_gguf_parameters(self):
         layer_norm_eps = self.hparams["layer_norm_epsilon"]
         rescale_every_n_layers = self.hparams["rescale_every"]
         intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64

         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -2762,6 +2764,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
         self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
         self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
         self.gguf_writer.add_feed_forward_length(intermediate_size)
         self.gguf_writer.add_file_type(self.ftype)
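
For reference, the converter's selection rule above can be isolated into a tiny helper. This is only a sketch of the heuristic shown in the diff; the helper name and the hidden sizes in the loop are illustrative, not tied to specific checkpoints.

    # Mirrors the logic added to set_gguf_parameters(): only hidden_size == 4096
    # gets the larger extra dims; everything else falls back to 32 / 64.
    def rwkv6_extra_dims(hidden_size: int) -> tuple[int, int]:
        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
        return time_mix_extra_dim, time_decay_extra_dim

    for hs in (2048, 2560, 4096):  # example hidden sizes
        print(hs, rwkv6_extra_dims(hs))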

gguf-py/gguf/constants.py (2 additions, 0 deletions)

@@ -95,6 +95,8 @@ class LLM:
         ATTN_LOGIT_SOFTCAPPING  = "{arch}.attn_logit_softcapping"
         FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
         RESCALE_EVERY_N_LAYERS  = "{arch}.rescale_every_n_layers"
+        TIME_MIX_EXTRA_DIM      = "{arch}.time_mix_extra_dim"
+        TIME_DECAY_EXTRA_DIM    = "{arch}.time_decay_extra_dim"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
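
These are key name templates; the writer fills in the architecture string. A minimal sketch of how they resolve, assuming gguf-py is installed and the RWKV6 architecture name "rwkv6":

    from gguf.constants import Keys

    print(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch="rwkv6"))    # rwkv6.time_mix_extra_dim
    print(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch="rwkv6"))  # rwkv6.time_decay_extra_dim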

gguf-py/gguf/gguf_writer.py (6 additions, 0 deletions)

@@ -673,6 +673,12 @@ def add_expert_weights_scale(self, value: float) -> None:
     def add_rescale_every_n_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)

+    def add_time_mix_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_time_decay_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
+
     def add_wkv_head_size(self, size: int) -> None:
         self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
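
A standalone usage sketch of the new writer methods; a real conversion goes through convert_hf_to_gguf.py, and the output path and values here are illustrative only. The write/close sequence follows the usual GGUFWriter pattern.

    from gguf import GGUFWriter

    writer = GGUFWriter("rwkv6-example.gguf", arch="rwkv6")
    writer.add_time_mix_extra_dim(32)    # stored as uint32 under "rwkv6.time_mix_extra_dim"
    writer.add_time_decay_extra_dim(64)  # stored as uint32 under "rwkv6.time_decay_extra_dim"
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()       # no tensors added in this sketch
    writer.close()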

src/llama.cpp (12 additions, 2 deletions)

@@ -298,6 +298,8 @@ enum llm_kv {
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -400,6 +402,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING,  "%s.attn_logit_softcapping"  },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
     { LLM_KV_RESCALE_EVERY_N_LAYERS,  "%s.rescale_every_n_layers"  },
+    { LLM_KV_TIME_MIX_EXTRA_DIM,      "%s.time_mix_extra_dim"      },
+    { LLM_KV_TIME_DECAY_EXTRA_DIM,    "%s.time_decay_extra_dim"    },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -2296,6 +2300,8 @@ struct llama_hparams {

     // for RWKV
     uint32_t rescale_every_n_layers = 0;
+    uint32_t time_mix_extra_dim = 0;
+    uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;

     float rope_attn_factor = 1.0f;
@@ -2362,6 +2368,8 @@ struct llama_hparams {
         if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;

         if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
+        if (this->time_mix_extra_dim   != other.time_mix_extra_dim)   return true;
+        if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
         if (this->wkv_head_size != other.wkv_head_size) return true;

         if (this->dec_start_token_id != other.dec_start_token_id) return true;
@@ -5909,6 +5917,8 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
                 ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);

                 switch (hparams.n_layer) {
@@ -8364,8 +8374,8 @@ static bool llm_load_tensors(
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                 model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});

-                const int time_mix_extra_dim = (n_embd == 4096) ? 64 : 32;
-                const int time_decay_extra_dim = (n_embd == 4096) ? 128 : 64;
+                const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+                const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                 const int head_size = hparams.wkv_head_size;
                 const int attn_hidden_size = n_embd;
                 const int ffn_size = hparams.n_ff_arr[0];
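
The last hunk replaces the hardcoded (n_embd == 4096) heuristic with the values read from GGUF metadata. Note that the two new get_key calls use the loader's default behaviour, which treats the key as required (unlike LLM_KV_RESCALE_EVERY_N_LAYERS, which passes false), so RWKV6 GGUFs converted before this change would likely need re-conversion. A minimal sketch for checking a converted file from the Python side with gguf-py's GGUFReader; the path is illustrative, and the scalar-extraction detail follows the pattern used in gguf-py's dump script:

    from gguf import GGUFReader

    reader = GGUFReader("rwkv6-example.gguf")
    for key in ("rwkv6.time_mix_extra_dim", "rwkv6.time_decay_extra_dim"):
        field = reader.fields.get(key)
        if field is None:
            print(f"{key}: missing (file predates this commit?)")
        else:
            # for scalar KVs the value lives in the last part of the field
            print(f"{key} = {field.parts[-1][0]}")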
