Skip to content

Commit 24ec43e

Browse files
committed
first working version
1 parent 8b51e7f commit 24ec43e

File tree

5 files changed

+110
-52
lines changed

5 files changed

+110
-52
lines changed

convert_hf_to_gguf.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2845,13 +2845,35 @@ def get_vision_config(self) -> dict[str, Any] | None:
28452845

28462846
def get_audio_config(self) -> dict[str, Any] | None:
28472847
return self.global_config["thinker_config"].get("audio_config")
2848+
2849+
2850+
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
    """Yield the fixed sinusoidal position embedding for the audio tower.

    Builds a (1500, hidden_size) SinusoidsPositionEmbedding table —
    sin terms in the first half of the channel dim, cos terms in the
    second — and emits it under the tensor name the audio tower expects.
    """
    assert self.hparams_audio is not None
    max_timescale = 10000
    length = 1500
    channels = self.hparams_audio["hidden_size"]
    half = channels // 2
    # Geometric progression of inverse timescales over half the channels.
    log_step = np.log(max_timescale) / (half - 1)
    inv_timescales = torch.exp(-log_step * torch.arange(half).float())
    # Outer product: (length, 1) * (1, half) -> (length, half).
    scaled_time = torch.arange(length).unsqueeze(1) * inv_timescales.unsqueeze(0)
    pos_embd = torch.cat((torch.sin(scaled_time), torch.cos(scaled_time)), dim=1).to(dtype=torch.float32)
    yield ("audio_tower.embed_positions.weight", pos_embd)
2861+
2862+
def tensor_force_quant(self, name, new_name, bid, n_dims):
    """Force conv layer weights to F16; all other tensors keep the default."""
    del bid, new_name, n_dims  # unused
    is_conv_weight = ".conv" in name and ".weight" in name
    return gguf.GGMLQuantizationType.F16 if is_conv_weight else False
28482867

28492868
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
28502869
if name.startswith("thinker."):
28512870
name = name.replace("thinker.", "")
28522871

28532872
if name.startswith("audio_tower"):
28542873
# process audio tensors
2874+
if "conv1.bias" in name or "conv2.bias" in name:
2875+
# transpose conv1 and conv2 bias
2876+
data_torch = data_torch.unsqueeze(-1)
28552877
if "audio_bos_eos_token" in name:
28562878
# this tensor is left unused in transformers code
28572879
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809

tools/mtmd/clip.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4066,7 +4066,8 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
40664066
}
40674067

40684068
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
4069-
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;
4069+
return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
4070+
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL;
40704071
}
40714072

40724073
bool clip_is_llava(const struct clip_ctx * ctx) {

tools/mtmd/clip.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include "ggml.h"
44
#include <stddef.h>
55
#include <stdint.h>
6-
#include <string>
76

87
// !!! Internal header, to be used by mtmd only !!!
98

tools/mtmd/mtmd-helper.cpp

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ struct decode_embd_batch {
6666
}
6767
}
6868

69-
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
69+
// M-RoPE for image
70+
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
7071
GGML_ASSERT(n_pos_per_embd == 4);
7172
seq_id_0[0] = seq_id;
7273
for (int y = 0; y < ny; y++) {
@@ -85,6 +86,23 @@ struct decode_embd_batch {
8586
}
8687
}
8788

89+
// M-RoPE for audio
90+
void set_position_mrope_1d(llama_pos pos_0, int32_t n_tokens, llama_seq_id seq_id) {
91+
GGML_ASSERT(n_pos_per_embd == 4);
92+
seq_id_0[0] = seq_id;
93+
for (int i = 0; i < n_tokens; i++) {
94+
pos[i ] = pos_0 + i;
95+
pos[i + batch.n_tokens ] = pos_0 + i;
96+
pos[i + batch.n_tokens * 2] = pos_0 + i;
97+
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
98+
}
99+
for (int i = 0; i < batch.n_tokens; i++) {
100+
batch.n_seq_id[i] = 1;
101+
batch.seq_id [i] = seq_id_0.data();
102+
batch.logits [i] = false;
103+
}
104+
}
105+
88106
llama_batch get_view(int offset, int n_tokens) {
89107
llama_pos * pos_ptr;
90108
pos_view.clear();
@@ -146,18 +164,20 @@ int32_t mtmd_helper_decode_image_chunk(
146164
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
147165

148166
if (mtmd_decode_use_mrope(ctx)) {
149-
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
150-
if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
151-
LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
152-
return -1;
153-
}
154-
if (!image_tokens) {
155-
LOG_ERR("failed to decode chunk: image tokens are null\n");
156-
return -1;
167+
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
168+
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
169+
if (!image_tokens) {
170+
LOG_ERR("failed to decode chunk: image tokens are null\n");
171+
return -1;
172+
}
173+
const int nx = mtmd_image_tokens_get_nx(image_tokens);
174+
const int ny = mtmd_image_tokens_get_ny(image_tokens);
175+
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
176+
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
177+
batch_embd.set_position_mrope_1d(n_past, n_tokens, seq_id);
178+
} else {
179+
GGML_ABORT("invalid chunk type for M-RoPE");
157180
}
158-
const int nx = mtmd_image_tokens_get_nx(image_tokens);
159-
const int ny = mtmd_image_tokens_get_ny(image_tokens);
160-
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
161181
} else {
162182
batch_embd.set_position_normal(n_past, seq_id);
163183
}

tools/mtmd/mtmd.cpp

Lines changed: 54 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -157,18 +157,26 @@ struct mtmd_context {
157157
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
158158
}
159159

160-
clip_ctx * ctx_clip = get_clip_ctx();
161-
if (llama_model_n_embd(text_model) != clip_n_mmproj_embd(ctx_clip)) {
160+
if (llama_model_n_embd(text_model) != n_embd_projected()) {
162161
throw std::runtime_error(string_format(
163162
"mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
164163
"hint: you may be using wrong mmproj\n",
165-
llama_model_n_embd(text_model), clip_n_mmproj_embd(ctx_clip)));
164+
llama_model_n_embd(text_model), n_embd_projected()));
166165
}
166+
if (ctx_v) {
167+
init_vision();
168+
}
169+
if (ctx_a) {
170+
init_audio();
171+
}
172+
}
167173

168-
use_mrope = clip_is_qwen2vl(ctx_clip);
174+
void init_vision() {
175+
GGML_ASSERT(ctx_v != nullptr);
176+
use_mrope = clip_is_qwen2vl(ctx_v);
169177

170-
projector_type proj = clip_get_projector_type(ctx_clip);
171-
int minicpmv_version = clip_is_minicpmv(ctx_clip);
178+
projector_type proj = clip_get_projector_type(ctx_v);
179+
int minicpmv_version = clip_is_minicpmv(ctx_v);
172180
if (minicpmv_version == 2) {
173181
// minicpmv 2.5 format:
174182
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@@ -219,57 +227,53 @@ struct mtmd_context {
219227
}
220228

221229
// set boi/eoi
222-
projector_type pt = proj_type();
223-
if (pt == PROJECTOR_TYPE_GEMMA3) {
230+
if (proj == PROJECTOR_TYPE_GEMMA3) {
224231
// <start_of_image> ... (image embeddings) ... <end_of_image>
225232
img_beg = "<start_of_image>";
226233
img_end = "<end_of_image>";
227234

228-
} else if (pt == PROJECTOR_TYPE_IDEFICS3) {
235+
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
229236
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
230237
img_beg = "<fake_token_around_image><global-img>";
231238
img_end = "<fake_token_around_image>";
232239

233-
} else if (pt == PROJECTOR_TYPE_PIXTRAL) {
240+
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
234241
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
235242
img_end = "[IMG_END]";
236243

237-
} else if (pt == PROJECTOR_TYPE_QWEN2VL || pt == PROJECTOR_TYPE_QWEN25VL) {
244+
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
238245
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
239246
img_beg = "<|vision_start|>";
240247
img_end = "<|vision_end|>";
241248

242-
} else if (pt == PROJECTOR_TYPE_LLAMA4) {
249+
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
243250
// (more details in mtmd_context constructor)
244251
img_beg = "<|image_start|>";
245252
img_end = "<|image_end|>";
253+
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
254+
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
246255

247-
} else if (pt == PROJECTOR_TYPE_INTERNVL) {
256+
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
248257
// <img> ... (image embeddings) ... </img>
249258
img_beg = "<img>";
250259
img_end = "</img>";
251260

252-
} else if (pt == PROJECTOR_TYPE_QWEN2A) {
261+
}
262+
}
263+
264+
void init_audio() {
265+
GGML_ASSERT(ctx_a != nullptr);
266+
projector_type proj = clip_get_projector_type(ctx_a);
267+
268+
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
269+
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
270+
271+
if (proj == PROJECTOR_TYPE_QWEN2A) {
253272
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
254273
aud_beg = "<|audio_bos|>";
255274
aud_end = "<|audio_eos|>";
256275

257276
}
258-
259-
// warning messages
260-
if (proj == PROJECTOR_TYPE_LLAMA4) {
261-
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
262-
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
263-
}
264-
if (ctx_a) {
265-
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
266-
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
267-
}
268-
}
269-
270-
// get the main clip ctx
271-
clip_ctx * get_clip_ctx() const {
272-
return ctx_v ? ctx_v : ctx_a;
273277
}
274278

275279
// get clip ctx based on chunk type
@@ -282,14 +286,17 @@ struct mtmd_context {
282286
GGML_ABORT("unknown chunk type");
283287
}
284288

285-
// both audio and vision contexts have the same projector type
286-
projector_type proj_type() const {
287-
return clip_get_projector_type(get_clip_ctx());
289+
// Projector type of the vision context, or UNKNOWN when none is loaded.
projector_type proj_type_v() const {
    if (!ctx_v) {
        return PROJECTOR_TYPE_UNKNOWN;
    }
    return clip_get_projector_type(ctx_v);
}
292+
293+
// Projector type of the audio context, or UNKNOWN when none is loaded.
projector_type proj_type_a() const {
    if (!ctx_a) {
        return PROJECTOR_TYPE_UNKNOWN;
    }
    return clip_get_projector_type(ctx_a);
}
289296

290297
// both audio and vision contexts have the n_embd output dimension
291298
int n_embd_projected() const {
292-
return clip_n_mmproj_embd(get_clip_ctx());
299+
return clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
293300
}
294301

295302
~mtmd_context() {
@@ -400,6 +407,7 @@ struct mtmd_tokenizer {
400407
}
401408

402409
void add_text(const std::string & txt, bool add_special, bool parse_special) {
410+
LOG_DBG("%s: %s\n", __func__, txt.c_str());
403411
auto tokens = mtmd_tokenize_text_internal(vocab, txt, add_special, parse_special);
404412
add_text(tokens);
405413
}
@@ -434,7 +442,9 @@ struct mtmd_tokenizer {
434442
return 2;
435443
}
436444

437-
add_text(ctx->img_beg, false, true); // add image begin token
445+
if (!ctx->img_beg.empty()) {
446+
add_text(ctx->img_beg, false, true); // add image begin token
447+
}
438448

439449
// convert mtmd_bitmap to clip_image_u8
440450
clip_image_u8_ptr img_u8(clip_image_u8_init());
@@ -549,7 +559,9 @@ struct mtmd_tokenizer {
549559
cur.entries.emplace_back(std::move(chunk));
550560
}
551561

552-
add_text(ctx->img_end, false, true); // add image end token
562+
if (!ctx->img_end.empty()) {
563+
add_text(ctx->img_end, false, true); // add image end token
564+
}
553565

554566
} else {
555567
// handle audio
@@ -564,7 +576,9 @@ struct mtmd_tokenizer {
564576
return 2;
565577
}
566578

567-
add_text(ctx->aud_beg, false, true); // add audio begin token
579+
if (!ctx->aud_beg.empty()) {
580+
add_text(ctx->aud_beg, false, true); // add audio begin token
581+
}
568582

569583
// preprocess audio
570584
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
@@ -606,7 +620,9 @@ struct mtmd_tokenizer {
606620
cur.entries.emplace_back(std::move(chunk));
607621
}
608622

609-
add_text(ctx->aud_end, false, true); // add audio end token
623+
if (!ctx->aud_end.empty()) {
624+
add_text(ctx->aud_end, false, true); // add audio end token
625+
}
610626
}
611627

612628
return 0;
@@ -751,7 +767,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
751767
}
752768

753769
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
754-
if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
770+
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
755771
return true;
756772
}
757773
return false;

0 commit comments

Comments
 (0)