fix tokenizing with add_special

ngxson · ngxson · commit d03c2407abb9 · 2025-05-26T11:43:39.000+02:00
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
@@ -130,7 +130,7 @@ enum projector_type {
     PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_LLAMA4,
     PROJECTOR_TYPE_QWEN2A,
-    PROJECTOR_TYPE_QWEN25O,
+    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_UNKNOWN,
 };
 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -402,10 +402,33 @@ struct mtmd_tokenizer {
                 }
             } else {
                 // this is a text part, we should add it as text
-                add_text(part, add_special, parse_special);
+                add_text(part, parse_special);
             }
         }
 
+        if (add_special && llama_vocab_get_add_bos(vocab)) {
+            // if first chunk is text, we add BOS token to first text chunk
+            // otherwise, create a new text chunk with BOS token
+            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                // add BOS token to the beginning of first text chunk
+                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
+            } else {
+                // create a new text chunk with BOS token at the beginning
+                mtmd_input_chunk bos_chunk{
+                    MTMD_INPUT_CHUNK_TYPE_TEXT,
+                    {llama_vocab_bos(vocab)},
+                    nullptr, // image tokens
+                    nullptr, // audio tokens
+                };
+                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
+            }
+        }
+
+        if (add_special && llama_vocab_get_add_eos(vocab)) {
+            // if last chunk is text, we add EOS token to it
+            add_text({llama_vocab_eos(vocab)});
+        }
+
         if (i_bm != bitmaps.size()) {
             LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
                     __func__, bitmaps.size(), parts.size() - 1);
@@ -417,9 +440,9 @@ struct mtmd_tokenizer {
         return 0;
     }
 
-    void add_text(const std::string & txt, bool add_special, bool parse_special) {
+    void add_text(const std::string & txt, bool parse_special) {
         LOG_DBG("%s: %s\n", __func__, txt.c_str());
-        auto tokens = mtmd_tokenize_text_internal(vocab, txt, add_special, parse_special);
+        auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
         add_text(tokens);
     }
 
@@ -454,7 +477,7 @@ struct mtmd_tokenizer {
             }
 
             if (!ctx->img_beg.empty()) {
-                add_text(ctx->img_beg, false, true); // add image begin token
+                add_text(ctx->img_beg, true); // add image begin token
             }
 
             // convert mtmd_bitmap to clip_image_u8
@@ -571,7 +594,7 @@ struct mtmd_tokenizer {
             }
 
             if (!ctx->img_end.empty()) {
-                add_text(ctx->img_end, false, true); // add image end token
+                add_text(ctx->img_end, true); // add image end token
             }
 
         } else {
@@ -588,7 +611,7 @@ struct mtmd_tokenizer {
             }
 
             if (!ctx->aud_beg.empty()) {
-                add_text(ctx->aud_beg, false, true); // add audio begin token
+                add_text(ctx->aud_beg, true); // add audio begin token
             }
 
             // preprocess audio
@@ -632,7 +655,7 @@ struct mtmd_tokenizer {
             }
 
             if (!ctx->aud_end.empty()) {
-                add_text(ctx->aud_end, false, true); // add audio end token
+                add_text(ctx->aud_end, true); // add audio end token
             }
         }
 
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
@@ -30,20 +30,23 @@ fi
 
 ###############
 
+arr_prefix=()
 arr_hf=()
 arr_tmpl=() # chat template
 arr_file=()
 
 add_test_vision() {
     local hf=$1
     local tmpl=${2:-""} # default to empty string if not provided
+    arr_prefix+=("[vision]")
     arr_hf+=("$hf")
     arr_tmpl+=("$tmpl")
     arr_file+=("test-1.jpeg")
 }
 
 add_test_audio() {
     local hf=$1
+    arr_prefix+=("[audio] ")
     arr_hf+=("$hf")
     arr_tmpl+=("") # no need for chat tmpl
     arr_file+=("test-2.mp3")
@@ -107,6 +110,7 @@ arr_res=()
 
 for i in "${!arr_hf[@]}"; do
     bin="llama-mtmd-cli"
+    prefix="${arr_prefix[$i]}"
     hf="${arr_hf[$i]}"
     tmpl="${arr_tmpl[$i]}"
     inp_file="${arr_file[$i]}"
@@ -127,9 +131,9 @@ for i in "${!arr_hf[@]}"; do
     echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
 
     if echo "$output" | grep -iq "new york"; then
-        result="\033[32mOK\033[0m:   $bin $hf"
+        result="$prefix \033[32mOK\033[0m:   $bin $hf"
     else
-        result="\033[31mFAIL\033[0m: $bin $hf"
+        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
     fi
     echo -e "$result"
     arr_res+=("$result")

Original file line number	Diff line number	Diff line change
`@@ -402,10 +402,33 @@ struct mtmd_tokenizer {`
`402`	`402`	`}`
`403`	`403`	`} else {`
`404`	`404`	`// this is a text part, we should add it as text`
`405`		`- add_text(part, add_special, parse_special);`
	`405`	`+ add_text(part, parse_special);`
`406`	`406`	`}`
`407`	`407`	`}`
`408`	`408`
	`409`	`+ if (add_special && llama_vocab_get_add_bos(vocab)) {`
	`410`	`+ // if first chunk is text, we add BOS token to first text chunk`
	`411`	`+ // otherwise, create a new text chunk with BOS token`
	`412`	`+ if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {`
	`413`	`+ // add BOS token to the beginning of first text chunk`
	`414`	`+ cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));`
	`415`	`+ } else {`
	`416`	`+ // create a new text chunk with BOS token at the beginning`
	`417`	`+ mtmd_input_chunk bos_chunk{`
	`418`	`+ MTMD_INPUT_CHUNK_TYPE_TEXT,`
	`419`	`+ {llama_vocab_bos(vocab)},`
	`420`	`+ nullptr, // image tokens`
	`421`	`+ nullptr, // audio tokens`
	`422`	`+ };`
	`423`	`+ cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));`
	`424`	`+ }`
	`425`	`+ }`
	`426`	`+`
	`427`	`+ if (add_special && llama_vocab_get_add_eos(vocab)) {`
	`428`	`+ // if last chunk is text, we add EOS token to it`
	`429`	`+ add_text({llama_vocab_eos(vocab)});`
	`430`	`+ }`
	`431`	`+`
`409`	`432`	`if (i_bm != bitmaps.size()) {`
`410`	`433`	`LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",`
`411`	`434`	`__func__, bitmaps.size(), parts.size() - 1);`
`@@ -417,9 +440,9 @@ struct mtmd_tokenizer {`
`417`	`440`	`return 0;`
`418`	`441`	`}`
`419`	`442`
`420`		`- void add_text(const std::string & txt, bool add_special, bool parse_special) {`
	`443`	`+ void add_text(const std::string & txt, bool parse_special) {`
`421`	`444`	`LOG_DBG("%s: %s\n", __func__, txt.c_str());`
`422`		`- auto tokens = mtmd_tokenize_text_internal(vocab, txt, add_special, parse_special);`
	`445`	`+ auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);`
`423`	`446`	`add_text(tokens);`
`424`	`447`	`}`
`425`	`448`
`@@ -454,7 +477,7 @@ struct mtmd_tokenizer {`
`454`	`477`	`}`
`455`	`478`
`456`	`479`	`if (!ctx->img_beg.empty()) {`
`457`		`- add_text(ctx->img_beg, false, true); // add image begin token`
	`480`	`+ add_text(ctx->img_beg, true); // add image begin token`
`458`	`481`	`}`
`459`	`482`
`460`	`483`	`// convert mtmd_bitmap to clip_image_u8`
`@@ -571,7 +594,7 @@ struct mtmd_tokenizer {`
`571`	`594`	`}`
`572`	`595`
`573`	`596`	`if (!ctx->img_end.empty()) {`
`574`		`- add_text(ctx->img_end, false, true); // add image end token`
	`597`	`+ add_text(ctx->img_end, true); // add image end token`
`575`	`598`	`}`
`576`	`599`
`577`	`600`	`} else {`
`@@ -588,7 +611,7 @@ struct mtmd_tokenizer {`
`588`	`611`	`}`
`589`	`612`
`590`	`613`	`if (!ctx->aud_beg.empty()) {`
`591`		`- add_text(ctx->aud_beg, false, true); // add audio begin token`
	`614`	`+ add_text(ctx->aud_beg, true); // add audio begin token`
`592`	`615`	`}`
`593`	`616`
`594`	`617`	`// preprocess audio`
`@@ -632,7 +655,7 @@ struct mtmd_tokenizer {`
`632`	`655`	`}`
`633`	`656`
`634`	`657`	`if (!ctx->aud_end.empty()) {`
`635`		`- add_text(ctx->aud_end, false, true); // add audio end token`
	`658`	`+ add_text(ctx->aud_end, true); // add audio end token`
`636`	`659`	`}`
`637`	`660`	`}`
`638`	`661`