@@ -254,7 +254,9 @@ struct clip_vision_model {
    ggml_tensor * post_ln_w;
    ggml_tensor * post_ln_b;

-   ggml_tensor * projection;
+   ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
+   ggml_tensor * mm_fc_w;
+   ggml_tensor * mm_fc_b;

    // LLaVA projection
    ggml_tensor * mm_input_norm_w = nullptr;
@@ -1471,48 +1473,58 @@ struct clip_graph {

        cb(cur, "after_transformer", -1);

-        // StackAudioFrames
-        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
-        {
-            int64_t stride = n_embd * hparams.proj_stack_factor;
-            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
-            int64_t pad = padded_len - ggml_nelements(cur);
-            if (pad > 0) {
-                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
-                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+        if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
+            // StackAudioFrames
+            // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+            {
+                int64_t stride = n_embd * hparams.proj_stack_factor;
+                int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
+                int64_t pad = padded_len - ggml_nelements(cur);
+                if (pad > 0) {
+                    cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
+                    cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+                }
+                cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+                                    ggml_row_size(cur->type, stride), 0);
            }
-            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
-                                ggml_row_size(cur->type, stride), 0);
-        }

-        cb(cur, "after_stacked", -1);
+            cb(cur, "after_stacked", -1);

-        // UltravoxProjector
-        {
-            // pre-norm
-            cur = ggml_rms_norm(ctx0, cur, 1e-6);
-            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+            // UltravoxProjector
+            {
+                // pre-norm
+                cur = ggml_rms_norm(ctx0, cur, 1e-6);
+                cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

-            // ffn in
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+                // ffn in
+                cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);

-            // swiglu
-            {
-                int64_t split_point = cur->ne[0] / 2;
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+                // swiglu
+                {
+                    int64_t split_point = cur->ne[0] / 2;
+                    ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                    ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                    // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
+                    x1 = ggml_silu(ctx0, x1);
+                    cur = ggml_mul(ctx0, x0, x1);
+                }

-                // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
-                x1 = ggml_silu(ctx0, x1);
-                cur = ggml_mul(ctx0, x0, x1);
+                // mid-norm
+                cur = ggml_rms_norm(ctx0, cur, 1e-6);
+                cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+                // ffn out
+                cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
            }

-            // mid-norm
-            cur = ggml_rms_norm(ctx0, cur, 1e-6);
-            cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+        } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+            // projector
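+            // Qwen2-Audio: a single fully-connected layer maps the encoder output to the LLM embedding size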
+            cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_fc_b);

-            // ffn out
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+        } else {
+            GGML_ABORT("%s: unknown projector type", __func__);
        }

        cb(cur, "projected", -1);
@@ -1655,6 +1667,17 @@ struct clip_graph {
            inpL = cur;
        }

+        // TODO @ngxson : find a way to move this outside
+        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+            ggml_tensor * cur = inpL;
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cont(ctx0, cur);
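+            // transpose so ggml_pool_1d (which pools along the first dim) averages pairs of
+            // consecutive frames, i.e. nn.AvgPool1d(2, stride=2), halving the number of audio tokens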
+            cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0);
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cont(ctx0, cur);
+            inpL = cur;
+        }
+
        // post-layernorm
        if (model.post_ln_w) {
            inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
@@ -1952,6 +1975,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                res = graph.build_llama4();
            } break;
        case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_QWEN2A:
            {
                res = graph.build_whisper_enc();
            } break;
@@ -2186,8 +2210,10 @@ struct clip_model_loader {
                    };
                } break;
            case PROJECTOR_TYPE_ULTRAVOX:
+            case PROJECTOR_TYPE_QWEN2A:
                {
-                    get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
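+                    // the stack factor is only mandatory for ultravox; qwen2-audio does not stack audio frames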
+                    bool require_stack = ctx_clip.proj_type == PROJECTOR_TYPE_ULTRAVOX;
+                    get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                    if (hparams.n_mel_bins != 128) {
                        throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
                    }
@@ -2266,7 +2292,7 @@ struct clip_model_loader {
            return cur;
        };

-        auto & vision_model = ctx_clip.vision_model;
+        auto & vision_model = ctx_clip.vision_model; // TODO: rename this to just "model"

        vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
@@ -2463,6 +2489,15 @@ struct clip_model_loader {
                    vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                    vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
                } break;
+            case PROJECTOR_TYPE_QWEN2A:
+                {
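+                    // whisper-style conv1d frontend, followed by a single fully-connected audio projector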
+                    vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                    vision_model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
+                    vision_model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
+                } break;
            case PROJECTOR_TYPE_INTERNVL:
                {
                    vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -3450,6 +3485,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
        const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
        n_patches = n_len / proj_stack_factor / 2;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+        // divide by 2 because of whisper
+        // another divide by 2 because of nn.AvgPool1d(2, stride=2)
+        n_patches = img->nx / 4;
    }

    return n_patches;
@@ -3850,6 +3889,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_IDEFICS3:
        case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_QWEN2A:
        case PROJECTOR_TYPE_ULTRAVOX:
            {
                // do nothing
@@ -3910,7 +3950,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int n_tokens_out = embeddings->ne[1];
    const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
    if (n_tokens_out != expected_n_tokens_out) {
-        LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
+        LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
        GGML_ABORT("Invalid number of output tokens");
    }
@@ -3955,6 +3995,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->vision_model.mm_3_w->ne[1];
        case PROJECTOR_TYPE_LLAMA4:
            return ctx->vision_model.mm_model_proj->ne[1];
+        case PROJECTOR_TYPE_QWEN2A:
+            return ctx->vision_model.mm_fc_w->ne[1];
        default:
            GGML_ABORT("Unknown projector type");
    }
@@ -3991,6 +4033,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.has_audio;
}

+bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type == PROJECTOR_TYPE_QWEN2A;
+}
+
bool clip_encode_float_image(struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
    clip_image_f32 clip_img;
    clip_img.buf.resize(h * w * 3);