@@ -157,18 +157,26 @@ struct mtmd_context {
157
157
throw std::runtime_error (string_format (" Failed to load CLIP model from %s\n " , mmproj_fname));
158
158
}
159
159
160
- clip_ctx * ctx_clip = get_clip_ctx ();
161
- if (llama_model_n_embd (text_model) != clip_n_mmproj_embd (ctx_clip)) {
160
+ if (llama_model_n_embd (text_model) != n_embd_projected ()) {
162
161
throw std::runtime_error (string_format (
163
162
" mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n "
164
163
" hint: you may be using wrong mmproj\n " ,
165
- llama_model_n_embd (text_model), clip_n_mmproj_embd (ctx_clip )));
164
+ llama_model_n_embd (text_model), n_embd_projected ( )));
166
165
}
166
+ if (ctx_v) {
167
+ init_vision ();
168
+ }
169
+ if (ctx_a) {
170
+ init_audio ();
171
+ }
172
+ }
167
173
168
- use_mrope = clip_is_qwen2vl (ctx_clip);
174
+ void init_vision () {
175
+ GGML_ASSERT (ctx_v != nullptr );
176
+ use_mrope = clip_is_qwen2vl (ctx_v);
169
177
170
- projector_type proj = clip_get_projector_type (ctx_clip );
171
- int minicpmv_version = clip_is_minicpmv (ctx_clip );
178
+ projector_type proj = clip_get_projector_type (ctx_v );
179
+ int minicpmv_version = clip_is_minicpmv (ctx_v );
172
180
if (minicpmv_version == 2 ) {
173
181
// minicpmv 2.5 format:
174
182
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@@ -219,57 +227,53 @@ struct mtmd_context {
219
227
}
220
228
221
229
// set boi/eoi
222
- projector_type pt = proj_type ();
223
- if (pt == PROJECTOR_TYPE_GEMMA3) {
230
+ if (proj == PROJECTOR_TYPE_GEMMA3) {
224
231
// <start_of_image> ... (image embeddings) ... <end_of_image>
225
232
img_beg = " <start_of_image>" ;
226
233
img_end = " <end_of_image>" ;
227
234
228
- } else if (pt == PROJECTOR_TYPE_IDEFICS3) {
235
+ } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
229
236
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
230
237
img_beg = " <fake_token_around_image><global-img>" ;
231
238
img_end = " <fake_token_around_image>" ;
232
239
233
- } else if (pt == PROJECTOR_TYPE_PIXTRAL) {
240
+ } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
234
241
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
235
242
img_end = " [IMG_END]" ;
236
243
237
- } else if (pt == PROJECTOR_TYPE_QWEN2VL || pt == PROJECTOR_TYPE_QWEN25VL) {
244
+ } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
238
245
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
239
246
img_beg = " <|vision_start|>" ;
240
247
img_end = " <|vision_end|>" ;
241
248
242
- } else if (pt == PROJECTOR_TYPE_LLAMA4) {
249
+ } else if (proj == PROJECTOR_TYPE_LLAMA4) {
243
250
// (more details in mtmd_context constructor)
244
251
img_beg = " <|image_start|>" ;
245
252
img_end = " <|image_end|>" ;
253
+ LOG_WRN (" %s: llama 4 vision is known to have degraded quality:\n "
254
+ " https://github.com/ggml-org/llama.cpp/pull/13282\n " , __func__);
246
255
247
- } else if (pt == PROJECTOR_TYPE_INTERNVL) {
256
+ } else if (proj == PROJECTOR_TYPE_INTERNVL) {
248
257
// <img> ... (image embeddings) ... </img>
249
258
img_beg = " <img>" ;
250
259
img_end = " </img>" ;
251
260
252
- } else if (pt == PROJECTOR_TYPE_QWEN2A) {
261
+ }
262
+ }
263
+
264
// Audio-side initialisation of the context.
// Requires ctx_a to be non-null (asserted below), reads the projector type from
// ctx_a, logs a warning that audio input is experimental, and sets the audio
// begin/end marker strings for projector types that define them.
// Only PROJECTOR_TYPE_QWEN2A assigns aud_beg / aud_end here; for any other
// projector type they are left untouched by this function.
+ void init_audio () {
265
+ GGML_ASSERT (ctx_a != nullptr );
266
+ projector_type proj = clip_get_projector_type (ctx_a);
267
+
268
// the experimental-audio warning is emitted for every audio projector type
+ LOG_WRN (" %s: audio input is in experimental stage and may have reduced quality:\n "
269
+ " https://github.com/ggml-org/llama.cpp/discussions/13759\n " , __func__);
270
+
271
+ if (proj == PROJECTOR_TYPE_QWEN2A) {
253
272
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
aud_beg = " <|audio_bos|>" ;
254
273
255
274
aud_end = " <|audio_eos|>" ;
256
275
257
276
}
258
// NOTE(review): the lines marked '-' below are the pre-change code that this
// patch removes (old warning section and the old get_clip_ctx() helper); they
// are not part of init_audio.
-
259
- // warning messages
260
- if (proj == PROJECTOR_TYPE_LLAMA4) {
261
- LOG_WRN (" %s: llama 4 vision is known to have degraded quality:\n "
262
- " https://github.com/ggml-org/llama.cpp/pull/13282\n " , __func__);
263
- }
264
- if (ctx_a) {
265
- LOG_WRN (" %s: audio input is in experimental stage and may have reduced quality:\n "
266
- " https://github.com/ggml-org/llama.cpp/discussions/13759\n " , __func__);
267
- }
268
- }
269
-
270
- // get the main clip ctx
271
- clip_ctx * get_clip_ctx () const {
272
- return ctx_v ? ctx_v : ctx_a;
273
277
}
274
278
275
279
// get clip ctx based on chunk type
@@ -282,14 +286,17 @@ struct mtmd_context {
282
286
GGML_ABORT (" unknown chunk type" );
283
287
}
284
288
285
- // both audio and vision contexts have the same projector type
286
- projector_type proj_type () const {
287
- return clip_get_projector_type (get_clip_ctx ());
289
// Projector type of the vision context.
// Returns PROJECTOR_TYPE_UNKNOWN when ctx_v is null, so it can be called
// without first checking whether a vision context exists.
+ projector_type proj_type_v () const {
290
+ return ctx_v ? clip_get_projector_type (ctx_v) : PROJECTOR_TYPE_UNKNOWN;
291
+ }
292
+
293
// Projector type of the audio context.
// Returns PROJECTOR_TYPE_UNKNOWN when ctx_a is null, so it can be called
// without first checking whether an audio context exists.
+ projector_type proj_type_a () const {
294
+ return ctx_a ? clip_get_projector_type (ctx_a) : PROJECTOR_TYPE_UNKNOWN;
288
295
}
289
296
290
297
// Output embedding dimension of the multimodal projector.
// The value is taken from the vision context when one exists, otherwise from
// the audio context.
// NOTE(review): this assumes both contexts project to the same n_embd output
// dimension; nothing in this function verifies that - confirm at load time.
// NOTE(review): if both ctx_v and ctx_a are null, a null pointer is passed to
// clip_n_mmproj_embd - presumably ruled out by the constructor; verify.
291
298
int n_embd_projected () const {
292
- return clip_n_mmproj_embd (get_clip_ctx () );
299
+ return clip_n_mmproj_embd (ctx_v ? ctx_v : ctx_a );
293
300
}
294
301
295
302
~mtmd_context () {
@@ -400,6 +407,7 @@ struct mtmd_tokenizer {
400
407
}
401
408
402
409
// Tokenize a piece of text and append the resulting tokens.
//   txt           - text to tokenize
//   add_special   - forwarded unchanged to mtmd_tokenize_text_internal
//   parse_special - forwarded unchanged to mtmd_tokenize_text_internal
// The text is logged at debug level first, then tokenized with this
// tokenizer's vocab, and the tokens are handed to the token-taking add_text.
void add_text (const std::string & txt, bool add_special, bool parse_special) {
410
+ LOG_DBG (" %s: %s\n " , __func__, txt.c_str ());
403
411
auto tokens = mtmd_tokenize_text_internal (vocab, txt, add_special, parse_special);
404
412
add_text (tokens);
405
413
}
@@ -434,7 +442,9 @@ struct mtmd_tokenizer {
434
442
return 2 ;
435
443
}
436
444
437
- add_text (ctx->img_beg , false , true ); // add image begin token
445
+ if (!ctx->img_beg .empty ()) {
446
+ add_text (ctx->img_beg , false , true ); // add image begin token
447
+ }
438
448
439
449
// convert mtmd_bitmap to clip_image_u8
440
450
clip_image_u8_ptr img_u8 (clip_image_u8_init ());
@@ -549,7 +559,9 @@ struct mtmd_tokenizer {
549
559
cur.entries .emplace_back (std::move (chunk));
550
560
}
551
561
552
- add_text (ctx->img_end , false , true ); // add image end token
562
+ if (!ctx->img_end .empty ()) {
563
+ add_text (ctx->img_end , false , true ); // add image end token
564
+ }
553
565
554
566
} else {
555
567
// handle audio
@@ -564,7 +576,9 @@ struct mtmd_tokenizer {
564
576
return 2 ;
565
577
}
566
578
567
- add_text (ctx->aud_beg , false , true ); // add audio begin token
579
+ if (!ctx->aud_beg .empty ()) {
580
+ add_text (ctx->aud_beg , false , true ); // add audio begin token
581
+ }
568
582
569
583
// preprocess audio
570
584
GGML_ASSERT (ctx->w_filters .n_mel ); // make sure we have filter preloaded
@@ -606,7 +620,9 @@ struct mtmd_tokenizer {
606
620
cur.entries .emplace_back (std::move (chunk));
607
621
}
608
622
609
- add_text (ctx->aud_end , false , true ); // add audio end token
623
+ if (!ctx->aud_end .empty ()) {
624
+ add_text (ctx->aud_end , false , true ); // add audio end token
625
+ }
610
626
}
611
627
612
628
return 0 ;
@@ -751,7 +767,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
751
767
}
752
768
753
769
bool mtmd_decode_use_non_causal (mtmd_context * ctx) {
754
- if (ctx->proj_type ( ) == PROJECTOR_TYPE_GEMMA3) {
770
+ if (ctx->ctx_v && clip_get_projector_type (ctx-> ctx_v ) == PROJECTOR_TYPE_GEMMA3) {
755
771
return true ;
756
772
}
757
773
return false ;
0 commit comments