@@ -35,6 +35,7 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac
35
35
36
36
// Selector for the activation function; the value is stored in hparams.ffn_op
// (the ULTRAVOX loader case in this diff assigns FFN_GELU_ERF) and dispatched
// on by a switch in the graph builder:
//   FFN_GELU_ERF   -> ggml_gelu_erf
//   FFN_GELU_QUICK -> ggml_gelu_quick
//   FFN_GELU       -> presumably ggml_gelu (its case label is outside this view)
//   FFN_SILU       -> handling not visible here; TODO confirm
// NOTE(review): FFN_GELU_ERF is inserted mid-enum, so the implicit numeric
// values of FFN_SILU and FFN_GELU_QUICK each shift by one. That is harmless
// only if these values are never persisted or compared as raw ints -- confirm.
enum ffn_op_type {
37
37
FFN_GELU,
38
+ FFN_GELU_ERF,
38
39
FFN_SILU,
39
40
FFN_GELU_QUICK,
40
41
};
@@ -1422,28 +1423,24 @@ struct clip_graph {
1422
1423
1423
1424
// whisper encoder with custom projector
1424
1425
ggml_cgraph * build_whisper_enc () {
1425
- const int n_step = img.nx ;
1426
- const int n_pos = n_step / 2 ;
1426
+ const int n_frames = img.nx ;
1427
+ const int n_pos = n_frames / 2 ;
1427
1428
GGML_ASSERT (model.position_embeddings ->ne [1 ] >= n_pos);
1428
1429
1429
1430
ggml_tensor * inp = build_inp_raw (1 );
1430
1431
1431
- ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos);
1432
- ggml_set_name (positions, " positions" );
1433
- ggml_set_input (positions);
1434
-
1435
1432
// conv1d block
1436
1433
{
1437
1434
// convolution + gelu
1438
1435
ggml_tensor * cur = ggml_conv_1d_ph (ctx0, model.conv1d_1_w , inp, 1 , 1 );
1439
1436
cur = ggml_add (ctx0, cur, model.conv1d_1_b );
1440
1437
1441
- cur = ggml_gelu (ctx0, cur);
1438
+ cur = ggml_gelu_erf (ctx0, cur);
1442
1439
1443
1440
cur = ggml_conv_1d_ph (ctx0, model.conv1d_2_w , cur, 2 , 1 );
1444
1441
cur = ggml_add (ctx0, cur, model.conv1d_2_b );
1445
1442
1446
- cur = ggml_gelu (ctx0, cur);
1443
+ cur = ggml_gelu_erf (ctx0, cur);
1447
1444
// transpose
1448
1445
inp = ggml_cont (ctx0, ggml_transpose (ctx0, cur));
1449
1446
cb (inp, " after_conv1d" , -1 );
@@ -1457,7 +1454,11 @@ struct clip_graph {
1457
1454
GGML_ASSERT (!model.layers [0 ].k_b ); // no bias for k
1458
1455
GGML_ASSERT (model.post_ln_w && model.post_ln_b );
1459
1456
1460
- ggml_tensor * pos_embd_selected = ggml_get_rows (ctx0, model.position_embeddings , positions);
1457
+ ggml_tensor * pos_embd_selected = ggml_view_2d (
1458
+ ctx0, model.position_embeddings ,
1459
+ model.position_embeddings ->ne [0 ], n_pos,
1460
+ model.position_embeddings ->nb [1 ], 0
1461
+ );
1461
1462
ggml_tensor * cur = build_vit (
1462
1463
inp, n_pos,
1463
1464
NORM_TYPE_NORMAL,
@@ -1751,6 +1752,11 @@ struct clip_graph {
1751
1752
cur = ggml_gelu (ctx0, cur);
1752
1753
cb (cur, " ffn_gelu" , il);
1753
1754
} break ;
1755
+ case FFN_GELU_ERF:
1756
+ {
1757
+ cur = ggml_gelu_erf (ctx0, cur);
1758
+ cb (cur, " ggml_gelu_erf" , il);
1759
+ } break ;
1754
1760
case FFN_GELU_QUICK:
1755
1761
{
1756
1762
cur = ggml_gelu_quick (ctx0, cur);
@@ -2169,7 +2175,7 @@ struct clip_model_loader {
2169
2175
case PROJECTOR_TYPE_ULTRAVOX:
2170
2176
{
2171
2177
get_u32 (KEY_PROJ_STACK_FACTOR, hparams.proj_stack_factor );
2172
- hparams.ffn_op = FFN_GELU ;
2178
+ hparams.ffn_op = FFN_GELU_ERF ;
2173
2179
} break ;
2174
2180
default :
2175
2181
break ;
@@ -3615,7 +3621,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3615
3621
} else {
3616
3622
// audio input
3617
3623
GGML_ASSERT (imgs.entries .size () == 1 );
3618
- const auto & mel_inp = imgs.entries [0 ]; // 3 channels, but only use one
3624
+ const auto & mel_inp = imgs.entries [0 ];
3619
3625
const int n_step = mel_inp->nx ;
3620
3626
const int n_mel = mel_inp->ny ;
3621
3627
std::vector<float > inp_raw (n_step * n_mel);
@@ -3817,6 +3823,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3817
3823
case PROJECTOR_TYPE_GEMMA3:
3818
3824
case PROJECTOR_TYPE_IDEFICS3:
3819
3825
case PROJECTOR_TYPE_INTERNVL:
3826
+ case PROJECTOR_TYPE_ULTRAVOX:
3820
3827
{
3821
3828
// do nothing
3822
3829
} break ;
@@ -3837,16 +3844,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3837
3844
}
3838
3845
set_input_i32 (" pos_w" , pos_data);
3839
3846
} break ;
3840
- case PROJECTOR_TYPE_ULTRAVOX:
3841
- {
3842
- const auto & mel_inp = imgs.entries [0 ];
3843
- const int n_pos = mel_inp->nx / 2 ;
3844
- std::vector<int32_t > positions (n_pos);
3845
- for (int i = 0 ; i < n_pos; i++) {
3846
- positions[i] = i;
3847
- }
3848
- set_input_i32 (" positions" , positions);
3849
- } break ;
3850
3847
default :
3851
3848
GGML_ABORT (" Unknown projector type" );
3852
3849
}
@@ -3988,12 +3985,12 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
3988
3985
return ctx->proj_type ;
3989
3986
}
3990
3987
3991
- void clip_image_f32_batch_add_mel (struct clip_image_f32_batch * batch, int n_mel, int n_step , float * mel) {
3988
+ void clip_image_f32_batch_add_mel (struct clip_image_f32_batch * batch, int n_mel, int n_frames , float * mel) {
3992
3989
clip_image_f32 * audio = new clip_image_f32;
3993
- audio->nx = n_step ;
3990
+ audio->nx = n_frames ;
3994
3991
audio->ny = n_mel;
3995
- audio->buf .resize (n_step * n_mel);
3996
- std::memcpy (audio->buf .data (), mel, n_step * n_mel * sizeof (float ));
3992
+ audio->buf .resize (n_frames * n_mel);
3993
+ std::memcpy (audio->buf .data (), mel, n_frames * n_mel * sizeof (float ));
3997
3994
3998
3995
batch->entries .push_back (clip_image_f32_ptr (audio));
3999
3996
batch->is_audio = true ;
0 commit comments