@@ -402,10 +402,33 @@ struct mtmd_tokenizer {
402
402
}
403
403
} else {
404
404
// this is a text part, we should add it as text
405
- add_text (part, add_special, parse_special);
405
+ add_text (part, parse_special);
406
406
}
407
407
}
408
408
409
+ if (add_special && llama_vocab_get_add_bos (vocab)) {
410
+ // if first chunk is text, we add BOS token to first text chunk
411
+ // otherwise, create a new text chunk with BOS token
412
+ if (!cur.entries .empty () && cur.entries [0 ].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
413
+ // add BOS token to the beginning of first text chunk
414
+ cur.entries [0 ].tokens_text .insert (cur.entries [0 ].tokens_text .begin (), llama_vocab_bos (vocab));
415
+ } else {
416
+ // create a new text chunk with BOS token at the beginning
417
+ mtmd_input_chunk bos_chunk{
418
+ MTMD_INPUT_CHUNK_TYPE_TEXT,
419
+ {llama_vocab_bos (vocab)},
420
+ nullptr , // image tokens
421
+ nullptr , // audio tokens
422
+ };
423
+ cur.entries .insert (cur.entries .begin (), std::move (bos_chunk));
424
+ }
425
+ }
426
+
427
+ if (add_special && llama_vocab_get_add_eos (vocab)) {
428
+ // if last chunk is text, we add EOS token to it
429
+ add_text ({llama_vocab_eos (vocab)});
430
+ }
431
+
409
432
if (i_bm != bitmaps.size ()) {
410
433
LOG_ERR (" %s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n " ,
411
434
__func__, bitmaps.size (), parts.size () - 1 );
@@ -417,9 +440,9 @@ struct mtmd_tokenizer {
417
440
return 0 ;
418
441
}
419
442
// add_text (string overload): logs the raw text at debug level, tokenizes it
// with mtmd_tokenize_text_internal, and forwards the resulting tokens to the
// token-vector overload add_text (tokens).
//   txt           - text fragment to tokenize
//   parse_special - passed through unchanged to mtmd_tokenize_text_internal
// This hunk removes the add_special parameter from the signature; the
// tokenizer call below now always receives add_special = false. In this same
// change the caller inserts BOS/EOS itself, guarded by add_special together
// with llama_vocab_get_add_bos / llama_vocab_get_add_eos.
420
- void add_text (const std::string & txt, bool add_special, bool parse_special) {
443
+ void add_text (const std::string & txt, bool parse_special) {
421
444
LOG_DBG (" %s: %s\n " , __func__, txt.c_str ());
// Old line passed the caller's add_special through; new line hard-codes false.
422
- auto tokens = mtmd_tokenize_text_internal (vocab, txt, add_special, parse_special);
445
+ auto tokens = mtmd_tokenize_text_internal (vocab, txt, /* add_special */ false , parse_special);
// Hand the tokens to the add_text (tokens) overload (defined outside this hunk).
423
446
add_text (tokens);
424
447
}
425
448
@@ -454,7 +477,7 @@ struct mtmd_tokenizer {
454
477
}
455
478
456
479
if (!ctx->img_beg .empty ()) {
457
- add_text (ctx->img_beg , false , true ); // add image begin token
480
+ add_text (ctx->img_beg , true ); // add image begin token
458
481
}
459
482
460
483
// convert mtmd_bitmap to clip_image_u8
@@ -571,7 +594,7 @@ struct mtmd_tokenizer {
571
594
}
572
595
573
596
if (!ctx->img_end .empty ()) {
574
- add_text (ctx->img_end , false , true ); // add image end token
597
+ add_text (ctx->img_end , true ); // add image end token
575
598
}
576
599
577
600
} else {
@@ -588,7 +611,7 @@ struct mtmd_tokenizer {
588
611
}
589
612
590
613
if (!ctx->aud_beg .empty ()) {
591
- add_text (ctx->aud_beg , false , true ); // add audio begin token
614
+ add_text (ctx->aud_beg , true ); // add audio begin token
592
615
}
593
616
594
617
// preprocess audio
@@ -632,7 +655,7 @@ struct mtmd_tokenizer {
632
655
}
633
656
634
657
if (!ctx->aud_end .empty ()) {
635
- add_text (ctx->aud_end , false , true ); // add audio end token
658
+ add_text (ctx->aud_end , true ); // add audio end token
636
659
}
637
660
}
638
661
0 commit comments