Skip to content

Commit d03c240

Browse files
committed
fix tokenizing with add_special
1 parent bf34f38 commit d03c240

File tree

3 files changed

+37
-10
lines changed

3 files changed

+37
-10
lines changed

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ enum projector_type {
130130
PROJECTOR_TYPE_INTERNVL,
131131
PROJECTOR_TYPE_LLAMA4,
132132
PROJECTOR_TYPE_QWEN2A,
133-
PROJECTOR_TYPE_QWEN25O,
133+
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
134134
PROJECTOR_TYPE_UNKNOWN,
135135
};
136136

tools/mtmd/mtmd.cpp

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -402,10 +402,33 @@ struct mtmd_tokenizer {
402402
}
403403
} else {
404404
// this is a text part, we should add it as text
405-
add_text(part, add_special, parse_special);
405+
add_text(part, parse_special);
406406
}
407407
}
408408

409+
if (add_special && llama_vocab_get_add_bos(vocab)) {
410+
// if first chunk is text, we add BOS token to first text chunk
411+
// otherwise, create a new text chunk with BOS token
412+
if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
413+
// add BOS token to the beginning of first text chunk
414+
cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
415+
} else {
416+
// create a new text chunk with BOS token at the beginning
417+
mtmd_input_chunk bos_chunk{
418+
MTMD_INPUT_CHUNK_TYPE_TEXT,
419+
{llama_vocab_bos(vocab)},
420+
nullptr, // image tokens
421+
nullptr, // audio tokens
422+
};
423+
cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
424+
}
425+
}
426+
427+
if (add_special && llama_vocab_get_add_eos(vocab)) {
428+
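// append the EOS token at the very end; add_text() is expected to merge it into the last chunk when that chunk is text, or start a new text chunk otherwise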
429+
add_text({llama_vocab_eos(vocab)});
430+
}
431+
409432
if (i_bm != bitmaps.size()) {
410433
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
411434
__func__, bitmaps.size(), parts.size() - 1);
@@ -417,9 +440,9 @@ struct mtmd_tokenizer {
417440
return 0;
418441
}
419442

420-
void add_text(const std::string & txt, bool add_special, bool parse_special) {
443+
void add_text(const std::string & txt, bool parse_special) {
421444
LOG_DBG("%s: %s\n", __func__, txt.c_str());
422-
auto tokens = mtmd_tokenize_text_internal(vocab, txt, add_special, parse_special);
445+
auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
423446
add_text(tokens);
424447
}
425448

@@ -454,7 +477,7 @@ struct mtmd_tokenizer {
454477
}
455478

456479
if (!ctx->img_beg.empty()) {
457-
add_text(ctx->img_beg, false, true); // add image begin token
480+
add_text(ctx->img_beg, true); // add image begin token
458481
}
459482

460483
// convert mtmd_bitmap to clip_image_u8
@@ -571,7 +594,7 @@ struct mtmd_tokenizer {
571594
}
572595

573596
if (!ctx->img_end.empty()) {
574-
add_text(ctx->img_end, false, true); // add image end token
597+
add_text(ctx->img_end, true); // add image end token
575598
}
576599

577600
} else {
@@ -588,7 +611,7 @@ struct mtmd_tokenizer {
588611
}
589612

590613
if (!ctx->aud_beg.empty()) {
591-
add_text(ctx->aud_beg, false, true); // add audio begin token
614+
add_text(ctx->aud_beg, true); // add audio begin token
592615
}
593616

594617
// preprocess audio
@@ -632,7 +655,7 @@ struct mtmd_tokenizer {
632655
}
633656

634657
if (!ctx->aud_end.empty()) {
635-
add_text(ctx->aud_end, false, true); // add audio end token
658+
add_text(ctx->aud_end, true); // add audio end token
636659
}
637660
}
638661

tools/mtmd/tests.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,23 @@ fi
3030

3131
###############
3232

33+
arr_prefix=()
3334
arr_hf=()
3435
arr_tmpl=() # chat template
3536
arr_file=()
3637

3738
add_test_vision() {
3839
local hf=$1
3940
local tmpl=${2:-""} # default to empty string if not provided
41+
arr_prefix+=("[vision]")
4042
arr_hf+=("$hf")
4143
arr_tmpl+=("$tmpl")
4244
arr_file+=("test-1.jpeg")
4345
}
4446

4547
add_test_audio() {
4648
local hf=$1
49+
arr_prefix+=("[audio] ")
4750
arr_hf+=("$hf")
4851
arr_tmpl+=("") # no need for chat tmpl
4952
arr_file+=("test-2.mp3")
@@ -107,6 +110,7 @@ arr_res=()
107110

108111
for i in "${!arr_hf[@]}"; do
109112
bin="llama-mtmd-cli"
113+
prefix="${arr_prefix[$i]}"
110114
hf="${arr_hf[$i]}"
111115
tmpl="${arr_tmpl[$i]}"
112116
inp_file="${arr_file[$i]}"
@@ -127,9 +131,9 @@ for i in "${!arr_hf[@]}"; do
127131
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
128132

129133
if echo "$output" | grep -iq "new york"; then
130-
result="\033[32mOK\033[0m: $bin $hf"
134+
result="$prefix \033[32mOK\033[0m: $bin $hf"
131135
else
132-
result="\033[31mFAIL\033[0m: $bin $hf"
136+
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
133137
fi
134138
echo -e "$result"
135139
arr_res+=("$result")

0 commit comments

Comments
 (0)