
Commit ccf392a

llama: rwkv6: Make use of key feed_forward_length
Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
1 parent 7c6e520 commit ccf392a

2 files changed (+3 / -3 lines)

convert_hf_to_gguf.py

Lines changed: 2 additions & 1 deletion
@@ -2753,6 +2753,7 @@ def set_gguf_parameters(self):
         hidden_size = self.hparams["hidden_size"]
         layer_norm_eps = self.hparams["layer_norm_epsilon"]
         rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
 
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -2761,11 +2762,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
         self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
         self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
         self.gguf_writer.add_file_type(self.ftype)
 
         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)
-        self.gguf_writer.add_feed_forward_length(0)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
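
Note: the new fallback expression means that when the HF config has no usable "intermediate_size", the converter derives the FFN width as 3.5x the hidden size, rounded down to a multiple of 32, and writes that value to the GGUF header instead of the previous placeholder of 0. A minimal sketch of that arithmetic (not part of the commit; the example widths are illustrative):

# Sketch only: the fallback used when the HF config lacks "intermediate_size".
# The FFN width is 3.5x the hidden size, rounded down to a multiple of 32.
def rwkv6_ffn_fallback(hidden_size: int) -> int:
    return int((hidden_size * 3.5) // 32 * 32)

print(rwkv6_ffn_fallback(2048))  # 7168
print(rwkv6_ffn_fallback(4096))  # 14336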

src/llama.cpp

Lines changed: 1 addition & 2 deletions
@@ -8368,7 +8368,7 @@ static bool llm_load_tensors(
                 const int time_decay_extra_dim = (n_embd == 4096) ? 128 : 64;
                 const int head_size = hparams.wkv_head_size;
                 const int attn_hidden_size = n_embd;
-                const int ffn_size = (int)(n_embd * 3.5 / 32) * 32;
+                const int ffn_size = hparams.n_ff_arr[0];
 
                 for (int i = 0; i < n_layer; ++i) {
                     ggml_context * ctx_layer = ctx_for_layer(i);
@@ -8391,7 +8391,6 @@ static bool llm_load_tensors(
                     layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
                     layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});
 
-                    // TODO: Parametrize hardcoded dimensions for first & decay
                     layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
                     layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
                     layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
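
Note: with this change the loader takes ffn_size from hparams.n_ff_arr[0], i.e. the feed_forward_length value the converter now writes, rather than recomputing the 3.5x heuristic at load time. A minimal sketch of inspecting that key with the gguf-py reader (the model path is a placeholder, and the "rwkv6.feed_forward_length" key name and field-access pattern are assumptions about gguf-py, not taken from this commit):

# Sketch only: check the feed_forward_length key written by the converter.
from gguf import GGUFReader

reader = GGUFReader("rwkv6-model.gguf")  # placeholder path
field = reader.get_field("rwkv6.feed_forward_length")
if field is not None:
    # For scalar fields, the value is assumed to live in the part indexed
    # by field.data[0].
    print(int(field.parts[field.data[0]][0]))
else:
    print("feed_forward_length not present")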
