first working version

ngxson · ngxson · commit 24ec43ebec42 · 2025-05-26T09:41:56.000+02:00
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -2845,13 +2845,35 @@ def get_vision_config(self) -> dict[str, Any] | None:
 
     def get_audio_config(self) -> dict[str, Any] | None:
         return self.global_config["thinker_config"].get("audio_config")
+        
+    
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        # SinusoidsPositionEmbedding: build a fixed sin/cos position table from the audio hparams and yield it as one extra tensor
+        assert self.hparams_audio is not None
+        max_timescale = 10000
+        length = 1500  # number of positions (rows) in the table
+        channels = self.hparams_audio["hidden_size"]  # table width: first half sin, second half cos
+        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())  # geometric series from 1 down to 1 / max_timescale
+        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]  # outer product -> (length, channels // 2)
+        pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)  # -> (length, 2 * (channels // 2))
+        yield ("audio_tower.embed_positions.weight", pos_embd)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims  # unused
+        if ".conv" in name and ".weight" in name:  # substring match on the tensor name passed in as `name`
+            return gguf.GGMLQuantizationType.F16  # conv weights are always stored as F16
+        return False  # presumably means "no forced type" to the caller; confirm against the base class
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name.startswith("thinker."):
             name = name.replace("thinker.", "")
 
         if name.startswith("audio_tower"):
             # process audio tensors
+            if "conv1.bias" in name or "conv2.bias" in name:
+                # append a trailing size-1 dim to conv1 and conv2 bias (unsqueeze, not a transpose)
+                data_torch = data_torch.unsqueeze(-1)
             if "audio_bos_eos_token" in name:
                 # this tensor is left unused in transformers code
                 # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -4066,7 +4066,8 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
 }
 
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
-    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;
+    return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;
 }
 
 bool clip_is_llava(const struct clip_ctx * ctx) {
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
@@ -3,7 +3,6 @@
 #include "ggml.h"
 #include <stddef.h>
 #include <stdint.h>
-#include <string>
 
 // !!! Internal header, to be used by mtmd only !!!
 
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
@@ -66,7 +66,8 @@ struct decode_embd_batch {
         }
     }
 
-    void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+    // M-RoPE for image
+    void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
         GGML_ASSERT(n_pos_per_embd == 4);
         seq_id_0[0] = seq_id;
         for (int y = 0; y < ny; y++) {
@@ -85,6 +86,23 @@ struct decode_embd_batch {
         }
     }
 
+    // M-RoPE for audio: token i gets position pos_0 + i in the first three pos dims
+    void set_position_mrope_1d(llama_pos pos_0, int32_t n_tokens, llama_seq_id seq_id) {
+        GGML_ASSERT(n_pos_per_embd == 4);
+        seq_id_0[0] = seq_id;
+        for (int i = 0; i < n_tokens; i++) { // NOTE(review): rows are strided by batch.n_tokens; assumes n_tokens == batch.n_tokens, confirm
+            pos[i                     ] = pos_0 + i;
+            pos[i + batch.n_tokens    ] = pos_0 + i;
+            pos[i + batch.n_tokens * 2] = pos_0 + i;
+            pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+        }
+        for (int i = 0; i < batch.n_tokens; i++) { // every token: one sequence id (seq_id), logits disabled
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+
     llama_batch get_view(int offset, int n_tokens) {
         llama_pos * pos_ptr;
         pos_view.clear();
@@ -146,18 +164,20 @@ int32_t mtmd_helper_decode_image_chunk(
     decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
 
     if (mtmd_decode_use_mrope(ctx)) {
-        const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
-        if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
-            return -1;
-        }
-        if (!image_tokens) {
-            LOG_ERR("failed to decode chunk: image tokens are null\n");
-            return -1;
+        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            if (!image_tokens) {
+                LOG_ERR("failed to decode chunk: image tokens are null\n");
+                return -1;
+            }
+            const int nx = mtmd_image_tokens_get_nx(image_tokens);
+            const int ny = mtmd_image_tokens_get_ny(image_tokens);
+            batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
+        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+            batch_embd.set_position_mrope_1d(n_past, n_tokens, seq_id);
+        } else {
+            GGML_ABORT("invalid chunk type for M-RoPE");
         }
-        const int nx = mtmd_image_tokens_get_nx(image_tokens);
-        const int ny = mtmd_image_tokens_get_ny(image_tokens);
-        batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
     } else {
         batch_embd.set_position_normal(n_past, seq_id);
     }
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -157,18 +157,26 @@ struct mtmd_context {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
 
-        clip_ctx * ctx_clip = get_clip_ctx();
-        if (llama_model_n_embd(text_model) != clip_n_mmproj_embd(ctx_clip)) {
+        if (llama_model_n_embd(text_model) != n_embd_projected()) {
             throw std::runtime_error(string_format(
                 "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
                 "hint: you may be using wrong mmproj\n",
-                llama_model_n_embd(text_model), clip_n_mmproj_embd(ctx_clip)));
+                llama_model_n_embd(text_model), n_embd_projected()));
         }
+        if (ctx_v) {
+            init_vision();
+        }
+        if (ctx_a) {
+            init_audio();
+        }
+    }
 
-        use_mrope = clip_is_qwen2vl(ctx_clip);
+    void init_vision() {
+        GGML_ASSERT(ctx_v != nullptr);
+        use_mrope = clip_is_qwen2vl(ctx_v);
 
-        projector_type proj = clip_get_projector_type(ctx_clip);
-        int minicpmv_version = clip_is_minicpmv(ctx_clip);
+        projector_type proj = clip_get_projector_type(ctx_v);
+        int minicpmv_version = clip_is_minicpmv(ctx_v);
         if (minicpmv_version == 2) {
             // minicpmv 2.5 format:
             // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@@ -219,57 +227,53 @@ struct mtmd_context {
         }
 
         // set boi/eoi
-        projector_type pt = proj_type();
-        if (pt == PROJECTOR_TYPE_GEMMA3) {
+        if (proj == PROJECTOR_TYPE_GEMMA3) {
             // <start_of_image> ... (image embeddings) ... <end_of_image>
             img_beg = "<start_of_image>";
             img_end = "<end_of_image>";
 
-        } else if (pt == PROJECTOR_TYPE_IDEFICS3) {
+        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
             // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
             img_beg = "<fake_token_around_image><global-img>";
             img_end = "<fake_token_around_image>";
 
-        } else if (pt == PROJECTOR_TYPE_PIXTRAL) {
+        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
             // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
             img_end = "[IMG_END]";
 
-        } else if (pt == PROJECTOR_TYPE_QWEN2VL || pt == PROJECTOR_TYPE_QWEN25VL) {
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
             // <|vision_start|> ... (image embeddings) ... <|vision_end|>
             img_beg = "<|vision_start|>";
             img_end = "<|vision_end|>";
 
-        } else if (pt == PROJECTOR_TYPE_LLAMA4) {
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
             // (more details in mtmd_context constructor)
             img_beg = "<|image_start|>";
             img_end = "<|image_end|>";
+            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
 
-        } else if (pt == PROJECTOR_TYPE_INTERNVL) {
+        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
             // <img> ... (image embeddings) ... </img>
             img_beg = "<img>";
             img_end = "</img>";
 
-        } else if (pt == PROJECTOR_TYPE_QWEN2A) {
+        }
+    }
+
+    void init_audio() {
+        GGML_ASSERT(ctx_a != nullptr);
+        projector_type proj = clip_get_projector_type(ctx_a);
+
+        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
+
+        if (proj == PROJECTOR_TYPE_QWEN2A) {
             // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
             aud_beg = "<|audio_bos|>";
             aud_end = "<|audio_eos|>";
 
         }
-
-        // warning messages
-        if (proj == PROJECTOR_TYPE_LLAMA4) {
-            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
-                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
-        }
-        if (ctx_a) {
-            LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
-                    "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
-        }
-    }
-
-    // get the main clip ctx
-    clip_ctx * get_clip_ctx() const {
-        return ctx_v ? ctx_v : ctx_a;
     }
 
     // get clip ctx based on chunk type
@@ -282,14 +286,17 @@ struct mtmd_context {
         GGML_ABORT("unknown chunk type");
     }
 
-    // both audio and vision contexts have the same projector type
-    projector_type proj_type() const {
-        return clip_get_projector_type(get_clip_ctx());
+    projector_type proj_type_v() const { // projector type of ctx_v, or PROJECTOR_TYPE_UNKNOWN when ctx_v is null
+        return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
+    }
+
+    projector_type proj_type_a() const { // projector type of ctx_a, or PROJECTOR_TYPE_UNKNOWN when ctx_a is null
+        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
     }
 
     // both audio and vision contexts have the n_embd output dimension
     int n_embd_projected() const {
-        return clip_n_mmproj_embd(get_clip_ctx());
+        return clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
     }
 
     ~mtmd_context() {
@@ -400,6 +407,7 @@ struct mtmd_tokenizer {
     }
 
     void add_text(const std::string & txt, bool add_special, bool parse_special) {
+        LOG_DBG("%s: %s\n", __func__, txt.c_str());
         auto tokens = mtmd_tokenize_text_internal(vocab, txt, add_special, parse_special);
         add_text(tokens);
     }
@@ -434,7 +442,9 @@ struct mtmd_tokenizer {
                 return 2;
             }
 
-            add_text(ctx->img_beg, false, true); // add image begin token
+            if (!ctx->img_beg.empty()) {
+                add_text(ctx->img_beg, false, true); // add image begin token
+            }
 
             // convert mtmd_bitmap to clip_image_u8
             clip_image_u8_ptr img_u8(clip_image_u8_init());
@@ -549,7 +559,9 @@ struct mtmd_tokenizer {
                 cur.entries.emplace_back(std::move(chunk));
             }
 
-            add_text(ctx->img_end, false, true); // add image end token
+            if (!ctx->img_end.empty()) {
+                add_text(ctx->img_end, false, true); // add image end token
+            }
 
         } else {
             // handle audio
@@ -564,7 +576,9 @@ struct mtmd_tokenizer {
                 return 2;
             }
 
-            add_text(ctx->aud_beg, false, true); // add audio begin token
+            if (!ctx->aud_beg.empty()) {
+                add_text(ctx->aud_beg, false, true); // add audio begin token
+            }
 
             // preprocess audio
             GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
@@ -606,7 +620,9 @@ struct mtmd_tokenizer {
                 cur.entries.emplace_back(std::move(chunk));
             }
 
-            add_text(ctx->aud_end, false, true); // add audio end token
+            if (!ctx->aud_end.empty()) {
+                add_text(ctx->aud_end, false, true); // add audio end token
+            }
         }
 
         return 0;
@@ -751,7 +767,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 }
 
 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
-    if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
+    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
         return true;
     }
     return false;

Original file line number	Diff line number	Diff line change
`@@ -4066,7 +4066,8 @@ bool clip_is_glm(const struct clip_ctx * ctx) {`
`4066`	`4066`	`}`
`4067`	`4067`
`4068`	`4068`	`bool clip_is_qwen2vl(const struct clip_ctx * ctx) {`
`4069`		`- return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL \|\| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;`
	`4069`	`+ return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL`
	`4070`	`+ \|\| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;`
`4070`	`4071`	`}`
`4071`	`4072`
`4072`	`4073`	`bool clip_is_llava(const struct clip_ctx * ctx) {`