Skip to content

Commit cf9613f

Browse files
committed
add some debug stuff
1 parent 3ce96d7 commit cf9613f

File tree

5 files changed, with 78 additions and 20 deletions.

tools/mtmd/clip.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,10 +1420,11 @@ struct clip_graph {
14201420
return gf;
14211421
}
14221422

1423-
// whisper encoder with ultravox projector
1424-
ggml_cgraph * build_ultravox() {
1423+
// whisper encoder with custom projector
1424+
ggml_cgraph * build_whisper_enc() {
14251425
const int n_step = img.nx;
14261426
const int n_pos = n_step / 2;
1427+
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
14271428

14281429
ggml_tensor * inp = build_inp_raw(1);
14291430

@@ -1943,7 +1944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
19431944
} break;
19441945
case PROJECTOR_TYPE_ULTRAVOX:
19451946
{
1946-
res = graph.build_ultravox();
1947+
res = graph.build_whisper_enc();
19471948
} break;
19481949
default:
19491950
{
@@ -3413,7 +3414,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
34133414
} else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
34143415
n_patches /= (scale_factor * scale_factor);
34153416
} else if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
3416-
n_patches = img->nx / ctx->vision_model.hparams.proj_stack_factor / 2;
3417+
const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
3418+
const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
3419+
n_patches = n_len / proj_stack_factor / 2;
34173420
}
34183421

34193422
return n_patches;

tools/mtmd/mtmd-audio.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,12 @@
99

1010
#include "mtmd-audio.h"
1111

12+
//#define MTMD_AUDIO_DEBUG
13+
1214
#define MINIAUDIO_IMPLEMENTATION
13-
#define MA_NO_ENCODING
15+
#ifndef MTMD_AUDIO_DEBUG
16+
# define MA_NO_ENCODING
17+
#endif
1418
#define MA_NO_DEVICE_IO
1519
#define MA_NO_RESOURCE_MANAGER
1620
#define MA_NO_NODE_GRAPH
@@ -300,7 +304,7 @@ static bool log_mel_spectrogram(
300304
bool preprocess_audio(
301305
const float * samples,
302306
size_t n_samples,
303-
whisper_filters & filters,
307+
const whisper_filters & filters,
304308
whisper_mel & output) {
305309

306310
// a bit hacky, but we want to align the output to a multiple of WHISPER_N_FFT * proj_stack_factor
@@ -373,6 +377,15 @@ bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_
373377
return false;
374378
}
375379

380+
#ifdef MTMD_AUDIO_DEBUG
381+
// save audio to wav file
382+
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
383+
ma_encoder encoder;
384+
ma_encoder_init_file("output.wav", &config, &encoder);
385+
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
386+
ma_encoder_uninit(&encoder);
387+
#endif
388+
376389
ma_decoder_uninit(&decoder);
377390
return true;
378391
}

tools/mtmd/mtmd-audio.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,23 +35,26 @@ struct whisper_filters {
3535
extern bool preprocess_audio(
3636
const float * samples,
3737
size_t n_samples,
38-
whisper_filters & filters,
38+
const whisper_filters & filters,
3939
whisper_mel & output);
4040

4141
} // namespace whisper_preprocessor
4242

4343

44-
44+
// TODO @ngxson : move this helper to mtmd-helpers.cpp
4545
namespace audio_helpers {
4646

4747
extern bool is_audio_file(const char * buf, size_t len);
4848

49-
extern bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono);
49+
extern bool decode_audio_from_buf(
50+
const unsigned char * buf_in,
51+
size_t len,
52+
int target_sampler_rate,
53+
std::vector<float> & pcmf32_mono);
5054

5155
} // namespace audio_helpers
5256

5357

54-
5558
namespace whisper_precalc_filters {
5659

5760
extern whisper_preprocessor::whisper_filters get_128_bins();

tools/mtmd/mtmd-helper.cpp

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,9 @@ int32_t mtmd_helper_decode_image_chunk(
150150
int32_t n_batch,
151151
llama_pos * new_n_past) {
152152
auto chunk_type = mtmd_input_chunk_get_type(chunk);
153+
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
153154
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
154-
LOG_ERR("failed to decode image chunk: input chunk not of image/audio type\n");
155+
LOG_ERR("failed to decode chunk: input chunk not of image/audio type\n");
155156
return -1;
156157
}
157158

@@ -166,8 +167,12 @@ int32_t mtmd_helper_decode_image_chunk(
166167

167168
if (mtmd_decode_use_mrope(ctx)) {
168169
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
170+
if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
171+
LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
172+
return -1;
173+
}
169174
if (!image_tokens) {
170-
LOG_ERR("failed to decode image chunk: image tokens are null\n");
175+
LOG_ERR("failed to decode chunk: image tokens are null\n");
171176
return -1;
172177
}
173178
const int nx = mtmd_image_tokens_get_nx(image_tokens);
@@ -187,17 +192,17 @@ int32_t mtmd_helper_decode_image_chunk(
187192
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
188193
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
189194

190-
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
195+
LOG_INF("decoding %s batch %d/%d, n_tokens_batch = %d\n", name, i_batch+1, n_img_batches, n_tokens_batch);
191196

192197
int64_t t1 = ggml_time_ms();
193198
int32_t ret = llama_decode(lctx, batch_embd_view);
194199
if (ret != 0) {
195-
LOG_ERR("failed to decode image\n");
200+
LOG_ERR("failed to decode %s\n", name);
196201
llama_set_causal_attn(lctx, true); // restore causal attn
197202
return ret;
198203
}
199204

200-
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
205+
LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
201206

202207
i_batch++;
203208
}
@@ -259,7 +264,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
259264

260265
ret = mtmd_encode_chunk(ctx, chunk);
261266
if (ret != 0) {
262-
LOG_ERR("failed to encode image\n");
267+
LOG_ERR("failed to encode %s slice\n", name);
263268
llama_batch_free(text_batch);
264269
return ret;
265270
}

tools/mtmd/mtmd.cpp

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -527,11 +527,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
527527
// DEBUG!!!
528528
// mel_spec.data.resize(220*8*2 * mel_spec.n_mel);
529529
// mel_spec.n_len = 220*8*2;
530-
LOG_DBG("mel_spec.n_len = %d\n", mel_spec.n_len);
531-
LOG_DBG("mel_spec.n_mel = %d\n", mel_spec.n_mel);
530+
mel_spec.n_len = 64*8*2;
531+
LOG_DBG("mel_spec.n_len = %d\n", mel_spec.n_len);
532+
LOG_DBG("mel_spec.n_len_org = %d\n", mel_spec.n_len_org);
533+
LOG_DBG("mel_spec.n_mel = %d\n", mel_spec.n_mel);
532534

533535
// convert mel spectrogram to clip_image_f32_batch
534-
clip_image_f32_ptr mel_f32(clip_image_f32_init());
536+
/*clip_image_f32_ptr mel_f32(clip_image_f32_init());
535537
mel_f32->nx = mel_spec.n_len;
536538
mel_f32->ny = mel_spec.n_mel;
537539
mel_f32->buf = std::move(mel_spec.data);
@@ -554,7 +556,39 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
554556
nullptr, // image tokens
555557
std::move(audio_tokens),
556558
};
557-
output->entries.emplace_back(std::move(chunk));
559+
output->entries.emplace_back(std::move(chunk));*/
560+
561+
for (size_t off = 0; off < (size_t)mel_spec.n_len; off += 160*8*2) {
562+
size_t len = std::min(mel_spec.n_len - off, (size_t)160*8*2);
563+
clip_image_f32_ptr mel_f32(clip_image_f32_init());
564+
mel_f32->nx = len;
565+
mel_f32->ny = mel_spec.n_mel;
566+
mel_f32->buf.resize(len * mel_spec.n_mel);
567+
std::memcpy(
568+
mel_f32->buf.data(),
569+
&mel_spec.data[off * mel_spec.n_mel],
570+
len * mel_spec.n_mel * sizeof(float));
571+
size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
572+
573+
clip_image_f32_batch batch_f32;
574+
batch_f32.is_audio = true;
575+
batch_f32.entries.push_back(std::move(mel_f32));
576+
577+
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
578+
audio_tokens->n_tokens = n_tokens;
579+
audio_tokens->batch_f32 = std::move(batch_f32);
580+
audio_tokens->id = bitmaps[i_bm]->id; // optional
581+
582+
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
583+
584+
mtmd_input_chunk chunk{
585+
MTMD_INPUT_CHUNK_TYPE_AUDIO,
586+
{}, // text tokens
587+
nullptr, // image tokens
588+
std::move(audio_tokens),
589+
};
590+
output->entries.emplace_back(std::move(chunk));
591+
}
558592

559593
i_bm++;
560594
continue;

0 commit comments

Comments
 (0)