@@ -328,6 +328,7 @@ enum llm_kv {
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
     LLM_KV_SSM_TIME_STEP_RANK,
+    LLM_KV_SSM_DT_B_C_RMS,
 
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_PRE,
@@ -426,6 +427,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
     { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
 
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
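Editor's note: the "%s" in LLM_KV_NAMES entries is substituted with the model's architecture string, so for a FalconMamba GGUF (which reuses the mamba architecture) the new metadata key would presumably appear as mamba.ssm.dt_b_c_rms. A minimal standalone sketch of that expansion; the architecture string here is an assumption, not taken from the PR:

    #include <cstdio>

    int main() {
        // Hypothetical: expand the per-architecture key the same way the
        // LLM_KV_NAMES format strings are used, assuming the arch is "mamba".
        const char * arch = "mamba";
        char key[64];
        std::snprintf(key, sizeof(key), "%s.ssm.dt_b_c_rms", arch);
        std::printf("%s\n", key);   // prints: mamba.ssm.dt_b_c_rms
        return 0;
    }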
@@ -2237,6 +2239,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+    bool ssm_dt_b_c_rms = false;
 
     float f_clamp_kqv = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -2286,6 +2289,7 @@ struct llama_hparams {
     if (this->ssm_d_inner != other.ssm_d_inner) return true;
     if (this->ssm_d_state != other.ssm_d_state) return true;
     if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
+    if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
 
     if (this->dec_start_token_id != other.dec_start_token_id) return true;
@@ -5052,6 +5056,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
     ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
     ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
 
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
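Editor's note: the trailing false passed to ml.get_key marks the key as optional, so older Mamba GGUFs without ssm.dt_b_c_rms keep the struct's default of false and only conversions that write the flag (e.g. FalconMamba) turn the extra norms on. A rough sketch of that read-if-present pattern, using hypothetical helper and key names rather than the loader's real API:

    #include <map>
    #include <string>

    // Hypothetical stand-in for an optional metadata read: the destination
    // keeps its default unless the key exists.
    static bool get_bool_key(const std::map<std::string, bool> & kv,
                             const std::string & key, bool & dst, bool required) {
        auto it = kv.find(key);
        if (it == kv.end()) {
            return !required;   // missing + optional -> keep the default
        }
        dst = it->second;
        return true;
    }

    int main() {
        std::map<std::string, bool> metadata = { /* no "mamba.ssm.dt_b_c_rms" entry */ };
        bool ssm_dt_b_c_rms = false;   // default, as in llama_hparams
        get_bool_key(metadata, "mamba.ssm.dt_b_c_rms", ssm_dt_b_c_rms, /*required=*/false);
        // ssm_dt_b_c_rms is still false here, matching a pre-existing Mamba GGUF.
        return 0;
    }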
@@ -5907,6 +5912,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
         LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
     }
 
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
@@ -12161,6 +12167,10 @@ struct llm_build_context {
     GGML_ASSERT(2 * d_model == d_inner);
     const int64_t d_state = hparams.ssm_d_state;
     const int64_t dt_rank = hparams.ssm_dt_rank;
+    // Some Mamba variants (e.g. FalconMamba) apply RMS norm on the B, C and dt layers
+    const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+    // Use the same RMS norm as the final layer norm
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
@@ -12241,6 +12251,13 @@ struct llm_build_context {
     struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
     struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
 
+    // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
+    if (ssm_dt_b_c_rms) {
+        dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
+        B = ggml_rms_norm(ctx0, B, norm_rms_eps);
+        C = ggml_rms_norm(ctx0, C, norm_rms_eps);
+    }
+
     // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
     dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
     dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
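Editor's note: ggml_rms_norm normalizes along a tensor's first dimension by the root mean square of the elements and applies no learned scale at this step (in ggml the scale, when present, is a separate multiply), which is why B, C and dt can be re-normalized here without extra weight tensors. A scalar sketch of that per-row operation, written as plain C++ purely for illustration:

    #include <cmath>
    #include <cstdio>

    // Illustrates the math applied along each row:
    // y[i] = x[i] / sqrt(mean(x^2) + eps), no learned scale.
    static void rms_norm_row(const float * x, float * y, int n, float eps) {
        float sum_sq = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum_sq += x[i] * x[i];
        }
        const float inv_rms = 1.0f / std::sqrt(sum_sq / n + eps);
        for (int i = 0; i < n; ++i) {
            y[i] = x[i] * inv_rms;
        }
    }

    int main() {
        const float x[4] = { 1.0f, -2.0f, 3.0f, -4.0f };
        float y[4];
        rms_norm_row(x, y, 4, 1e-5f);   // eps plays the role of norm_rms_eps above
        for (float v : y) std::printf("%f\n", v);
        return 0;
    }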
@@ -16105,6 +16122,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
         default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
     }
+    if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+        new_type = GGML_TYPE_F16;
+    }
     LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
     ++qs.n_fallback;
 }
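Editor's note: the new guard appears to cover tensors whose row length is not a multiple of the chosen fallback type's block size (Q8_0, for example, packs values in blocks of 32), dropping them to F16, whose block size is 1, instead of producing an invalid layout. That presumably is what allows ssm_x.weight and ssm_dt.weight to be removed from the never-quantize list in the last hunk below. A standalone sketch of the check, with the sizes and type names chosen only for illustration:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical block sizes mirroring ggml's packing: Q8_0 packs rows in
    // groups of 32 values, while F16 stores values individually (block size 1).
    static int blck_size(const char * type) {
        if (type[0] == 'Q') return 32;
        return 1;
    }

    int main() {
        const int64_t ne0 = 48;           // illustrative row length, not from a real model
        const char * new_type = "Q8_0";
        if (ne0 % blck_size(new_type) != 0) {
            new_type = "F16";             // same fallback as the added check above
        }
        std::printf("row of %lld elements -> %s\n", (long long) ne0, new_type);
        return 0;
    }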
@@ -16433,8 +16453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // do not quantize Mamba's small yet 2D weights
     // NOTE: can't use LLM_TN here because the layer number is not known
     quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
-    quantize &= name.find("ssm_x.weight") == std::string::npos;
-    quantize &= name.find("ssm_dt.weight") == std::string::npos;
 
     // do not quantize relative position bias (T5)
     quantize &= name.find("attn_rel_b.weight") == std::string::npos;