Feat: Support for falcon-mamba architecture #9074
Merged
Commits (18 in total; the diff below shows changes from 3 of them):

59a08be  feat: initial support for llama.cpp  (younesbelkada)
bfa0286  fix: lint  (younesbelkada)
b97704c  refactor: better refactor  (younesbelkada)
343b583  Update src/llama.cpp  (younesbelkada)
a8109e3  Update src/llama.cpp  (younesbelkada)
184a4c6  fix: address comments  (younesbelkada)
3494265  Update convert_hf_to_gguf.py  (younesbelkada)
f7d2e91  fix: add more cleanup and harmonization  (younesbelkada)
9e22bb7  fix: lint  (younesbelkada)
4553502  Update gguf-py/gguf/gguf_writer.py  (younesbelkada)
d637bb9  fix: change name  (younesbelkada)
57c3eb4  Apply suggestions from code review  (younesbelkada)
bf5e344  add in operator  (younesbelkada)
ca4db9e  fix: add `dt_b_c_rms` in `llm_load_print_meta`  (younesbelkada)
78ad84f  fix: correct printf format for bool  (younesbelkada)
7aeccbb  fix: correct print format  (younesbelkada)
5c0f108  Update src/llama.cpp  (younesbelkada)
3491291  llama : quantize more Mamba tensors  (compilade)
src/llama.cpp

@@ -328,6 +328,7 @@ enum llm_kv {
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
     LLM_KV_SSM_TIME_STEP_RANK,
+    LLM_KV_SSM_B_DT_RMS,

     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_PRE,
@@ -426,6 +427,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_INNER_SIZE,     "%s.ssm.inner_size"     },
     { LLM_KV_SSM_STATE_SIZE,     "%s.ssm.state_size"     },
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_B_DT_RMS,       "%s.ssm.b_dt_rms"       },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE,   "tokenizer.ggml.pre"   },
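The "%s" in these name templates is filled with the model's general.architecture string when the key is looked up. A minimal, self-contained sketch of that expansion follows; it is not llama.cpp's internal helper, and the "mamba" prefix is an assumption for illustration:

```cpp
// Sketch: how a "%s"-templated KV name expands into the per-architecture
// GGUF key. The "mamba" prefix below is an assumption for illustration;
// the real prefix is whatever general.architecture the model declares.
#include <cstdio>
#include <string>

static std::string format_kv(const char * templ, const char * arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), templ, arch);
    return buf;
}

int main() {
    // Prints: mamba.ssm.b_dt_rms
    std::printf("%s\n", format_kv("%s.ssm.b_dt_rms", "mamba").c_str());
    return 0;
}
```

If the converted model reuses the mamba architecture name, the new flag is therefore stored under mamba.ssm.b_dt_rms in the GGUF metadata.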
@@ -2237,6 +2239,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+    bool ssm_b_dt_rms = false;

     float f_clamp_kqv = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -5052,6 +5055,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
     ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
     ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_B_DT_RMS, hparams.ssm_b_dt_rms);

     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
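Outside of llama.cpp's loader, the stored flag can also be inspected with the public gguf C API. The sketch below is hedged: the "mamba" key prefix and the file name are assumptions, and the gguf_* calls used (gguf_init_from_file, gguf_find_key, gguf_get_val_bool, gguf_free) come from ggml's public header:

```cpp
// Standalone sketch: check the new ssm.b_dt_rms flag in a converted GGUF
// file. Assumptions: the architecture prefix is "mamba" and the gguf_* API
// is available via ggml.h (newer trees expose it via a separate gguf.h).
#include "ggml.h"

#include <cstdio>

int main(int argc, char ** argv) {
    const char * fname = argc > 1 ? argv[1] : "falcon-mamba-7b.gguf"; // hypothetical path

    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * gctx = gguf_init_from_file(fname, params);
    if (!gctx) {
        std::fprintf(stderr, "failed to open %s\n", fname);
        return 1;
    }

    const auto key_id = gguf_find_key(gctx, "mamba.ssm.b_dt_rms");
    if (key_id >= 0) {
        std::printf("ssm.b_dt_rms = %s\n", gguf_get_val_bool(gctx, key_id) ? "true" : "false");
    } else {
        std::printf("key not present (loader default is false)\n");
    }

    gguf_free(gctx);
    return 0;
}
```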
@@ -12161,6 +12165,10 @@ struct llm_build_context {
     GGML_ASSERT(2 * d_model == d_inner);
     const int64_t d_state = hparams.ssm_d_state;
     const int64_t dt_rank = hparams.ssm_dt_rank;
+    // Some Mamba variants (e.g. FalconMamba) apply RMS norm on the B, C and dt projections
+    const bool ssm_b_dt_rms = hparams.ssm_b_dt_rms;
+    // Use the same RMS norm epsilon as the final layer norm
+    const float norm_rms_eps = hparams.f_norm_rms_eps;

     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
@@ -12241,6 +12249,13 @@
     struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
     struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));

+    // Some Mamba variants (e.g. FalconMamba) apply RMS norm on the B, C and dt projections
+    if (ssm_b_dt_rms) {
+        dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
+        B  = ggml_rms_norm(ctx0, B,  norm_rms_eps);
+        C  = ggml_rms_norm(ctx0, C,  norm_rms_eps);
+    }

     // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
     dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
     dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);

Review comment on this hunk: "This will eventually be rewritten to use [...], but for now I think this is fine."
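To make the effect of the new branch concrete, here is a reference sketch of the per-row computation ggml_rms_norm performs on dt, B and C: each row is divided by the root mean square of its elements, with eps added under the square root, and no learned weight is applied at this step. This is an illustrative reimplementation, not ggml's code:

```cpp
// Reference semantics of the RMS norm applied above to the dt, B and C
// projections. ggml_rms_norm performs this independently on every row of
// the tensor; this sketch shows a single row and is not ggml's kernel.
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> rms_norm_row(const std::vector<float> & x, float eps) {
    // mean of squares over the row
    double sum_sq = 0.0;
    for (float v : x) {
        sum_sq += (double) v * v;
    }
    const float scale = 1.0f / std::sqrt((float) (sum_sq / x.size()) + eps);

    // scale every element; no learned weight is multiplied in here
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = x[i] * scale;
    }
    return y;
}
```

Reusing hparams.f_norm_rms_eps as eps matches the comment in the previous hunk: these projections are normalized with the same epsilon as the final RMS norm.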