@@ -402,9 +402,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
-    // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
-
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
         return;
@@ -500,6 +497,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.n_embd_head_v = 0;
     }
 
+    // for differentiating model types
+    uint32_t n_vocab = 0;
+    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
+
     // arch-specific KVs
     switch (arch) {
         case LLM_ARCH_LLAMA:
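
The hunk above moves the vocab-size read out of hparams into a local n_vocab used only for model-type detection, keeping the same key-or-array fallback: take the explicit vocab-size key if present, otherwise count the tokenizer token list. Below is a minimal standalone sketch of that fallback pattern; gguf_metadata, try_get_key, and try_get_arr_n are hypothetical stand-ins for the loader calls in the diff, not the llama_model_loader API.

// Minimal sketch of the key-or-array fallback, with hypothetical helpers.
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

struct gguf_metadata {                       // illustrative stand-in for loaded GGUF KVs
    std::optional<uint32_t>  vocab_size;     // the explicit vocab-size key, if present
    std::vector<std::string> tokenizer_list; // the tokenizer token array, if present
};

static bool try_get_key(const gguf_metadata & md, uint32_t & out) {
    if (!md.vocab_size) {
        return false;                        // key absent: let the caller fall back
    }
    out = *md.vocab_size;
    return true;
}

static bool try_get_arr_n(const gguf_metadata & md, uint32_t & out) {
    if (md.tokenizer_list.empty()) {
        return false;
    }
    out = (uint32_t) md.tokenizer_list.size();
    return true;
}

static uint32_t read_n_vocab(const gguf_metadata & md) {
    uint32_t n_vocab = 0;
    // mirrors ml.get_key(..., n_vocab, false) || ml.get_arr_n(..., n_vocab, false):
    // the token list is only consulted when the vocab-size key is missing
    if (!try_get_key(md, n_vocab)) {
        try_get_arr_n(md, n_vocab);
    }
    return n_vocab;
}
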
@@ -519,7 +520,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 26: type = LLM_TYPE_3B; break;
                     case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
                     // granite uses a vocab with len 49152
-                    case 32: type = hparams.n_vocab == 49152 ? LLM_TYPE_3B : (hparams.n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+                    case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                     case 36: type = LLM_TYPE_8B; break; // granite
                     case 40: type = LLM_TYPE_13B; break;
                     case 48: type = LLM_TYPE_34B; break;
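
For 32-layer checkpoints the layer count alone is ambiguous, so the type is resolved from the vocabulary size, now read from the local n_vocab. The sketch below restates that decision as a free function; the 49152 and 40000 thresholds come from the diff, while the toy enum is an illustrative stand-in rather than the project's llm_type.

// Restates the 32-layer disambiguation above; names are illustrative only.
#include <cstdint>

enum toy_llm_type { TOY_TYPE_3B, TOY_TYPE_7B, TOY_TYPE_8B };

static toy_llm_type classify_32_layer_llama(uint32_t n_vocab) {
    if (n_vocab == 49152) {
        return TOY_TYPE_3B;   // granite uses a vocab with len 49152
    }
    if (n_vocab < 40000) {
        return TOY_TYPE_7B;   // smaller vocabularies, e.g. 32000 tokens
    }
    return TOY_TYPE_8B;       // larger vocabularies
}
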
@@ -621,7 +622,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
                 switch (hparams.n_layer) {
@@ -644,7 +644,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 hparams.f_max_alibi_bias = 8.0f;
 
@@ -658,7 +657,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -1369,8 +1367,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_ff          = hparams.n_ff();
     const int64_t n_embd_gqa    = n_embd_v_gqa;
-    const int64_t n_vocab       = hparams.n_vocab;
-    const int64_t n_vocab_type  = hparams.n_vocab_type;
+    const int64_t n_vocab       = vocab.n_vocab();
+    const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
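
In load_tensors the two counts now come from the vocabulary object rather than from hparams, matching the removal of hparams.n_vocab and hparams.n_vocab_type elsewhere in the diff. A rough sketch of a vocab-like holder exposing the two accessors read above; this is an illustrative stand-in, not the real llama_vocab.

// Illustrative only: exposes the counts that load_tensors() reads above.
#include <cstdint>
#include <string>
#include <vector>

class toy_vocab {
public:
    uint32_t n_vocab()       const { return (uint32_t) tokens.size(); }
    uint32_t n_token_types() const { return n_types; }

    std::vector<std::string> tokens;      // one entry per token id
    uint32_t                 n_types = 0; // token-type (segment) count for BERT-style models
};
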
@@ -1815,7 +1813,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_NOMIC_BERT:
            {
                tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
-               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
+               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
 
                if (arch == LLM_ARCH_BERT) {
                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -1869,7 +1867,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_JINA_BERT_V2:
            {
                tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
+               type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
 
                tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
                tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd}, 0); // LayerNorm bias
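
In both BERT-style architectures the token-type (segment) embedding is a small {n_embd, n_token_types} matrix; BERT-family models commonly use two token types. The sketch below shows how such a table can be indexed, under the assumption that it holds n_token_types rows of n_embd floats stored contiguously; names here are illustrative.

// Sketch of indexing a token-type embedding table shaped like the tensors above.
#include <cstdint>
#include <vector>

struct toy_type_embedding {
    int64_t n_embd        = 0;
    int64_t n_token_types = 0;   // commonly 2 for BERT-style segment embeddings
    std::vector<float> weight;   // size: n_embd * n_token_types

    // pointer to the n_embd-wide embedding row for a given token type id
    const float * row(int64_t token_type) const {
        return weight.data() + token_type * n_embd;
    }
};
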
@@ -3553,7 +3551,6 @@ void llama_model::print_info() const {
 
     // hparams
     LLAMA_LOG_INFO("%s: arch         = %s\n", __func__, arch_name().c_str());
-    LLAMA_LOG_INFO("%s: n_vocab (hp) = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: vocab_only   = %d\n", __func__, hparams.vocab_only);
 
     if (!hparams.vocab_only) {