mtmd: add audio debug helpers (optional WAV dump, chunked mel encoding, clearer image/audio log messages)

ngxson · ngxson · commit cf9613f689cf · 2025-05-20T12:50:44.000+02:00
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -1420,10 +1420,11 @@ struct clip_graph {
         return gf;
     }
 
-    // whisper encoder with ultravox projector
-    ggml_cgraph * build_ultravox() {
+    // whisper encoder with custom projector
+    ggml_cgraph * build_whisper_enc() {
         const int n_step = img.nx;
         const int n_pos  = n_step / 2;
+        GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
 
         ggml_tensor * inp = build_inp_raw(1);
 
@@ -1943,7 +1944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
             {
-                res = graph.build_ultravox();
+                res = graph.build_whisper_enc();
             } break;
         default:
             {
@@ -3413,7 +3414,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
     } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
         n_patches /= (scale_factor * scale_factor);
     } else if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
-        n_patches = img->nx / ctx->vision_model.hparams.proj_stack_factor / 2;
+        const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
+        const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
+        n_patches = n_len / proj_stack_factor / 2;
     }
 
     return n_patches;
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
@@ -9,8 +9,12 @@
 
 #include "mtmd-audio.h"
 
+//#define MTMD_AUDIO_DEBUG
+
 #define MINIAUDIO_IMPLEMENTATION
-#define MA_NO_ENCODING
+#ifndef MTMD_AUDIO_DEBUG
+#   define MA_NO_ENCODING
+#endif
 #define MA_NO_DEVICE_IO
 #define MA_NO_RESOURCE_MANAGER
 #define MA_NO_NODE_GRAPH
@@ -300,7 +304,7 @@ static bool log_mel_spectrogram(
 bool preprocess_audio(
         const float * samples,
         size_t n_samples,
-        whisper_filters & filters,
+        const whisper_filters & filters,
         whisper_mel & output) {
 
     // a bit hacky, but we want to align the output to a multiple of WHISPER_N_FFT * proj_stack_factor
@@ -373,6 +377,15 @@ bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_
         return false;
     }
 
+#ifdef MTMD_AUDIO_DEBUG
+    // debug only: dump the decoded mono PCM to "output.wav" (relative to the current working directory)
+    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
+    ma_encoder encoder;
+    ma_encoder_init_file("output.wav", &config, &encoder);
+    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
+    ma_encoder_uninit(&encoder);
+#endif
+
     ma_decoder_uninit(&decoder);
     return true;
 }
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
@@ -35,23 +35,26 @@ struct whisper_filters {
 extern bool preprocess_audio(
         const float * samples,
         size_t n_samples,
-        whisper_filters & filters,
+        const whisper_filters & filters,
         whisper_mel & output);
 
 } // namespace whisper_preprocessor
 
 
-
+// TODO @ngxson : move these helpers to mtmd-helper.cpp
 namespace audio_helpers {
 
 extern bool is_audio_file(const char * buf, size_t len);
 
-extern bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono);
+extern bool decode_audio_from_buf(
+        const unsigned char * buf_in,
+        size_t len,
+        int target_sampler_rate,
+        std::vector<float> & pcmf32_mono);
 
 } // namespace audio_helpers
 
 
-
 namespace whisper_precalc_filters {
 
 extern whisper_preprocessor::whisper_filters get_128_bins();
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
@@ -150,8 +150,9 @@ int32_t mtmd_helper_decode_image_chunk(
         int32_t n_batch,
         llama_pos * new_n_past) {
     auto chunk_type = mtmd_input_chunk_get_type(chunk);
+    const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
     if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-        LOG_ERR("failed to decode image chunk: input chunk not of image/audio type\n");
+        LOG_ERR("failed to decode chunk: input chunk not of image/audio type\n");
         return -1;
     }
 
@@ -166,8 +167,12 @@ int32_t mtmd_helper_decode_image_chunk(
 
     if (mtmd_decode_use_mrope(ctx)) {
         const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+        if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
+            return -1;
+        }
         if (!image_tokens) {
-            LOG_ERR("failed to decode image chunk: image tokens are null\n");
+            LOG_ERR("failed to decode chunk: image tokens are null\n");
             return -1;
         }
         const int nx = mtmd_image_tokens_get_nx(image_tokens);
@@ -187,17 +192,17 @@ int32_t mtmd_helper_decode_image_chunk(
         int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
         llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
 
-        LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+        LOG_INF("decoding %s batch %d/%d, n_tokens_batch = %d\n", name, i_batch+1, n_img_batches, n_tokens_batch);
 
         int64_t t1 = ggml_time_ms();
         int32_t ret = llama_decode(lctx, batch_embd_view);
         if (ret != 0) {
-            LOG_ERR("failed to decode image\n");
+            LOG_ERR("failed to decode %s\n", name);
             llama_set_causal_attn(lctx, true); // restore causal attn
             return ret;
         }
 
-        LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
+        LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
 
         i_batch++;
     }
@@ -259,7 +264,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
 
         ret = mtmd_encode_chunk(ctx, chunk);
         if (ret != 0) {
-            LOG_ERR("failed to encode image\n");
+            LOG_ERR("failed to encode %s slice\n", name);
             llama_batch_free(text_batch);
             return ret;
         }
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -527,11 +527,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             // DEBUG!!!
             // mel_spec.data.resize(220*8*2 * mel_spec.n_mel);
             // mel_spec.n_len = 220*8*2;
-            LOG_DBG("mel_spec.n_len = %d\n", mel_spec.n_len);
-            LOG_DBG("mel_spec.n_mel = %d\n", mel_spec.n_mel);
+            mel_spec.n_len = 64*8*2;
+            LOG_DBG("mel_spec.n_len     = %d\n", mel_spec.n_len);
+            LOG_DBG("mel_spec.n_len_org = %d\n", mel_spec.n_len_org);
+            LOG_DBG("mel_spec.n_mel     = %d\n", mel_spec.n_mel);
 
             // convert mel spectrogram to clip_image_f32_batch
-            clip_image_f32_ptr mel_f32(clip_image_f32_init());
+            /*clip_image_f32_ptr mel_f32(clip_image_f32_init());
             mel_f32->nx = mel_spec.n_len;
             mel_f32->ny = mel_spec.n_mel;
             mel_f32->buf = std::move(mel_spec.data);
@@ -554,7 +556,39 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 nullptr, // image tokens
                 std::move(audio_tokens),
             };
-            output->entries.emplace_back(std::move(chunk));
+            output->entries.emplace_back(std::move(chunk));*/
+
+            for (size_t off = 0; off < (size_t)mel_spec.n_len; off += 160*8*2) {
+                size_t len = std::min(mel_spec.n_len - off, (size_t)160*8*2);
+                clip_image_f32_ptr mel_f32(clip_image_f32_init());
+                mel_f32->nx = len;
+                mel_f32->ny = mel_spec.n_mel;
+                mel_f32->buf.resize(len * mel_spec.n_mel);
+                std::memcpy(
+                    mel_f32->buf.data(),
+                    &mel_spec.data[off * mel_spec.n_mel],
+                    len * mel_spec.n_mel * sizeof(float));
+                size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
+
+                clip_image_f32_batch batch_f32;
+                batch_f32.is_audio = true;
+                batch_f32.entries.push_back(std::move(mel_f32));
+
+                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
+                audio_tokens->n_tokens = n_tokens;
+                audio_tokens->batch_f32 = std::move(batch_f32);
+                audio_tokens->id = bitmaps[i_bm]->id; // optional
+
+                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_AUDIO,
+                    {}, // text tokens
+                    nullptr, // image tokens
+                    std::move(audio_tokens),
+                };
+                output->entries.emplace_back(std::move(chunk));
+            }
 
             i_bm++;
             continue;