#pragma warning(disable: 4244 4267) // possible loss of data
#endif

+ // tensor names
+ #define TN_TOKEN_EMBD  "token_embd.weight"
+ #define TN_OUTPUT_NORM "output_norm.weight"
+ #define TN_OUTPUT      "output.weight"
+ #define TN_ATTN_NORM   "blk.%d.attn_norm.weight"
+ #define TN_ATTN_Q      "blk.%d.attn_q.weight"
+ #define TN_ATTN_K      "blk.%d.attn_k.weight"
+ #define TN_ATTN_V      "blk.%d.attn_v.weight"
+ #define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
+ #define TN_FFN_NORM    "blk.%d.ffn_norm.weight"
+ #define TN_FFN_GATE    "blk.%d.ffn_gate.weight"
+ #define TN_FFN_DOWN    "blk.%d.ffn_down.weight"
+ #define TN_FFN_UP      "blk.%d.ffn_up.weight"
+
static void llama_log_internal        (llama_log_level level, const char * format, ...);
static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
#define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
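Note: the per-block TN_* macros above keep a %d placeholder for the layer index; the hunks below expand it with the printf-style format() helper already used in the codebase. Below is a minimal standalone sketch of that expansion; the tn() helper is a stand-in written for illustration, not the actual llama.cpp implementation.

#include <cstdio>
#include <string>

#define TN_ATTN_Q "blk.%d.attn_q.weight"

// stand-in for the internal printf-style format() helper used in the diff below
static std::string tn(const char * fmt, int i) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, i);
    return std::string(buf);
}

int main() {
    std::printf("%s\n", tn(TN_ATTN_Q, 7).c_str()); // prints: blk.7.attn_q.weight
    return 0;
}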
@@ -1310,7 +1324,7 @@ static void llama_model_load_internal(

        ml->ggml_ctx = ctx;

-       model.tok_embeddings = ml->get_tensor("token_embd.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+       model.tok_embeddings = ml->get_tensor(TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);

        // "output" tensor
        {
@@ -1331,8 +1345,8 @@ static void llama_model_load_internal(
                backend_output = GGML_BACKEND_CPU;
            }

-           model.norm   = ml->get_tensor("output_norm.weight", {n_embd},          backend_norm);
-           model.output = ml->get_tensor("output.weight",      {n_embd, n_vocab}, backend_output);
+           model.norm   = ml->get_tensor(TN_OUTPUT_NORM, {n_embd},          backend_norm);
+           model.output = ml->get_tensor(TN_OUTPUT,      {n_embd, n_vocab}, backend_output);
            if (backend_norm == GGML_BACKEND_GPU) {
                vram_weights += ggml_nbytes(model.norm);
            }
@@ -1349,21 +1363,18 @@ static void llama_model_load_internal(
            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

            auto & layer = model.layers[i];
+           layer.attention_norm = ml->get_tensor(format(TN_ATTN_NORM, i), {n_embd}, backend);

-           std::string layers_i = "blk." + std::to_string(i);
-
-           layer.attention_norm = ml->get_tensor(layers_i + ".attn_norm.weight", {n_embd}, backend);
-
-           layer.wq = ml->get_tensor(layers_i + ".attn_q.weight",      {n_embd, n_embd},     backend_split);
-           layer.wk = ml->get_tensor(layers_i + ".attn_k.weight",      {n_embd, n_embd_gqa}, backend_split);
-           layer.wv = ml->get_tensor(layers_i + ".attn_v.weight",      {n_embd, n_embd_gqa}, backend_split);
-           layer.wo = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd},     backend_split);
+           layer.wq = ml->get_tensor(format(TN_ATTN_Q,      i), {n_embd, n_embd},     backend_split);
+           layer.wk = ml->get_tensor(format(TN_ATTN_K,      i), {n_embd, n_embd_gqa}, backend_split);
+           layer.wv = ml->get_tensor(format(TN_ATTN_V,      i), {n_embd, n_embd_gqa}, backend_split);
+           layer.wo = ml->get_tensor(format(TN_ATTN_OUTPUT, i), {n_embd, n_embd},     backend_split);

-           layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+           layer.ffn_norm = ml->get_tensor(format(TN_FFN_NORM, i), {n_embd}, backend);

-           layer.w1 = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd,   n_ff}, backend_split);
-           layer.w2 = ml->get_tensor(layers_i + ".ffn_down.weight", {  n_ff, n_embd}, backend_split);
-           layer.w3 = ml->get_tensor(layers_i + ".ffn_up.weight",   {n_embd,   n_ff}, backend_split);
+           layer.w1 = ml->get_tensor(format(TN_FFN_GATE, i), {n_embd,   n_ff}, backend_split);
+           layer.w2 = ml->get_tensor(format(TN_FFN_DOWN, i), {  n_ff, n_embd}, backend_split);
+           layer.w3 = ml->get_tensor(format(TN_FFN_UP,   i), {n_embd,   n_ff}, backend_split);

            if (backend == GGML_BACKEND_GPU) {
                vram_weights +=
@@ -3240,10 +3251,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    int n_attention_wv    = 0;
    int n_feed_forward_w2 = 0;
    for (auto & tensor : model_loader->tensors_map.tensors) {
-       if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+       if (tensor.name.find("attn_v.weight") != std::string::npos) {
            ++n_attention_wv;
        }
-       else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+       else if (tensor.name.find("ffn_down.weight") != std::string::npos) {
            ++n_feed_forward_w2;
        }
    }
@@ -3298,13 +3309,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        } else {
            new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
-           if (tensor.name == "output.weight") {
+           if (tensor.name == TN_OUTPUT) {
                int nx = tensor.ne.at(0);
                int ny = tensor.ne.at(1);
                if (nx % QK_K == 0 && ny % QK_K == 0) {
                    new_type = GGML_TYPE_Q6_K;
                }
-           } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+           } else if (tensor.name.find("attn_v.weight") != std::string::npos) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -3319,7 +3330,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                    use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
                //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
                ++i_feed_forward_w2;
-           } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+           } else if (tensor.name.find("attn_output.weight") != std::string::npos) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
            }
@@ -3334,10 +3345,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                }
            }
            if (convert_incompatible_tensor) {
-               if (tensor.name == "output.weight") {
+               if (tensor.name == TN_OUTPUT) {
                    new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-               } else if (tensor.name == "tok_embeddings.weight") {
+               } else if (tensor.name == TN_TOKEN_EMBD) {
                    new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
                } else {
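The quantization hunks above only change how tensors are recognized by name: the old attention.wv / feed_forward.w2 / attention.wo substrings become attn_v / ffn_down / attn_output, and the exact-match cases use TN_OUTPUT and TN_TOKEN_EMBD. An illustrative, self-contained sketch of that name-based classification follows; the classify() helper is hypothetical, not llama.cpp API.

#include <cstdio>
#include <string>

#define TN_TOKEN_EMBD "token_embd.weight"
#define TN_OUTPUT     "output.weight"

// hypothetical helper mirroring the name checks in the quantization hunks above
static const char * classify(const std::string & name) {
    if (name == TN_OUTPUT)                                    return "output (exact match, may be promoted to Q6_K)";
    if (name == TN_TOKEN_EMBD)                                return "token embeddings (exact match)";
    if (name.find("attn_v.weight")      != std::string::npos) return "attention V (counted as n_attention_wv)";
    if (name.find("ffn_down.weight")    != std::string::npos) return "FFN down (counted as n_feed_forward_w2)";
    if (name.find("attn_output.weight") != std::string::npos) return "attention output";
    return "other";
}

int main() {
    std::printf("%s\n", classify("blk.3.attn_v.weight")); // attention V (counted as n_attention_wv)
    std::printf("%s\n", classify("output.weight"));       // output (exact match, may be promoted to Q6_K)
    return 0;
}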