Skip to content

Commit 23d0d7f

Browse files
committed
long audio input ok
1 parent cf9613f commit 23d0d7f

File tree

7 files changed

+484
-482
lines changed

7 files changed

+484
-482
lines changed

tools/mtmd/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# mtmd
22

3-
# compile mtmd-audio separately to avoid long compile times
3+
# compile mtmd-audio separately to avoid long compile times with miniaudio.h
4+
# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
45
add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
56
if (BUILD_SHARED_LIBS)
67
set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)

tools/mtmd/clip-impl.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,10 @@ struct clip_image_u8 {
159159
std::vector<uint8_t> buf;
160160
};
161161

162-
// RGB float32 image (NHWC)
163-
// Memory layout: RGBRGBRGB...
162+
// For images, buf.size() == nx*ny*3
163+
// Memory layout: RGBRGBRGB...
164164
// For audio, only one channel is used, buf.size() == nx*ny
165+
// nx will be n_frames and ny will be n_mel
165166
struct clip_image_f32 {
166167
int nx;
167168
int ny;

tools/mtmd/clip.cpp

Lines changed: 22 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac
3535

3636
enum ffn_op_type {
3737
FFN_GELU,
38+
FFN_GELU_ERF,
3839
FFN_SILU,
3940
FFN_GELU_QUICK,
4041
};
@@ -1422,28 +1423,24 @@ struct clip_graph {
14221423

14231424
// whisper encoder with custom projector
14241425
ggml_cgraph * build_whisper_enc() {
1425-
const int n_step = img.nx;
1426-
const int n_pos = n_step / 2;
1426+
const int n_frames = img.nx;
1427+
const int n_pos = n_frames / 2;
14271428
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
14281429

14291430
ggml_tensor * inp = build_inp_raw(1);
14301431

1431-
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
1432-
ggml_set_name(positions, "positions");
1433-
ggml_set_input(positions);
1434-
14351432
// conv1d block
14361433
{
14371434
// convolution + gelu
14381435
ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
14391436
cur = ggml_add(ctx0, cur, model.conv1d_1_b);
14401437

1441-
cur = ggml_gelu(ctx0, cur);
1438+
cur = ggml_gelu_erf(ctx0, cur);
14421439

14431440
cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
14441441
cur = ggml_add(ctx0, cur, model.conv1d_2_b);
14451442

1446-
cur = ggml_gelu(ctx0, cur);
1443+
cur = ggml_gelu_erf(ctx0, cur);
14471444
// transpose
14481445
inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
14491446
cb(inp, "after_conv1d", -1);
@@ -1457,7 +1454,11 @@ struct clip_graph {
14571454
GGML_ASSERT(!model.layers[0].k_b); // no bias for k
14581455
GGML_ASSERT(model.post_ln_w && model.post_ln_b);
14591456

1460-
ggml_tensor * pos_embd_selected = ggml_get_rows(ctx0, model.position_embeddings, positions);
1457+
ggml_tensor * pos_embd_selected = ggml_view_2d(
1458+
ctx0, model.position_embeddings,
1459+
model.position_embeddings->ne[0], n_pos,
1460+
model.position_embeddings->nb[1], 0
1461+
);
14611462
ggml_tensor * cur = build_vit(
14621463
inp, n_pos,
14631464
NORM_TYPE_NORMAL,
@@ -1751,6 +1752,11 @@ struct clip_graph {
17511752
cur = ggml_gelu(ctx0, cur);
17521753
cb(cur, "ffn_gelu", il);
17531754
} break;
1755+
case FFN_GELU_ERF:
1756+
{
1757+
cur = ggml_gelu_erf(ctx0, cur);
1758+
cb(cur, "ggml_gelu_erf", il);
1759+
} break;
17541760
case FFN_GELU_QUICK:
17551761
{
17561762
cur = ggml_gelu_quick(ctx0, cur);
@@ -2169,7 +2175,7 @@ struct clip_model_loader {
21692175
case PROJECTOR_TYPE_ULTRAVOX:
21702176
{
21712177
get_u32(KEY_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
2172-
hparams.ffn_op = FFN_GELU;
2178+
hparams.ffn_op = FFN_GELU_ERF;
21732179
} break;
21742180
default:
21752181
break;
@@ -3615,7 +3621,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
36153621
} else {
36163622
// audio input
36173623
GGML_ASSERT(imgs.entries.size() == 1);
3618-
const auto & mel_inp = imgs.entries[0]; // 3 channels, but only use one
3624+
const auto & mel_inp = imgs.entries[0];
36193625
const int n_step = mel_inp->nx;
36203626
const int n_mel = mel_inp->ny;
36213627
std::vector<float> inp_raw(n_step * n_mel);
@@ -3817,6 +3823,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
38173823
case PROJECTOR_TYPE_GEMMA3:
38183824
case PROJECTOR_TYPE_IDEFICS3:
38193825
case PROJECTOR_TYPE_INTERNVL:
3826+
case PROJECTOR_TYPE_ULTRAVOX:
38203827
{
38213828
// do nothing
38223829
} break;
@@ -3837,16 +3844,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
38373844
}
38383845
set_input_i32("pos_w", pos_data);
38393846
} break;
3840-
case PROJECTOR_TYPE_ULTRAVOX:
3841-
{
3842-
const auto & mel_inp = imgs.entries[0];
3843-
const int n_pos = mel_inp->nx / 2;
3844-
std::vector<int32_t> positions(n_pos);
3845-
for (int i = 0; i < n_pos; i++) {
3846-
positions[i] = i;
3847-
}
3848-
set_input_i32("positions", positions);
3849-
} break;
38503847
default:
38513848
GGML_ABORT("Unknown projector type");
38523849
}
@@ -3988,12 +3985,12 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
39883985
return ctx->proj_type;
39893986
}
39903987

3991-
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_step, float * mel) {
3988+
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
39923989
clip_image_f32 * audio = new clip_image_f32;
3993-
audio->nx = n_step;
3990+
audio->nx = n_frames;
39943991
audio->ny = n_mel;
3995-
audio->buf.resize(n_step * n_mel);
3996-
std::memcpy(audio->buf.data(), mel, n_step * n_mel * sizeof(float));
3992+
audio->buf.resize(n_frames * n_mel);
3993+
std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
39973994

39983995
batch->entries.push_back(clip_image_f32_ptr(audio));
39993996
batch->is_audio = true;

tools/mtmd/clip.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ bool clip_is_gemma3(const struct clip_ctx * ctx);
9595
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
9696

9797
// used by audio input
98-
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_step, float * mel);
98+
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
9999

100100
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
101101
bool clip_has_audio_encoder(const struct clip_ctx * ctx);

0 commit comments

Comments
 (0)