Commit 40aaa8a

mtmd : add support for Qwen2-Audio and SeaLLM-Audio (#13760)
* mtmd : add Qwen2-Audio support
* small clean up
* update discussion link
* clarify mtmd_get_output_embd
* clarification in multimodal.md
* fix ultravox bug
* ggml_cont
1 parent a08c1d2 commit 40aaa8a

9 files changed: 139 additions, 47 deletions


convert_hf_to_gguf.py

Lines changed: 22 additions & 8 deletions
@@ -2643,7 +2643,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
 
 
-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
 class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2
 
@@ -2667,8 +2667,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             name = f"model.{name}" # map to Qwen2ForCausalLM tensors
         if "language_model." in name:
             name = name.replace("language_model.", "") # for InternVL
-        if name.startswith("mlp") or name.startswith("vision_model"):
-            # skip visual tensors
+        if name.startswith("mlp") or name.startswith("multi_modal_projector") \
+                or name.startswith("vision_model") or name.startswith("audio_tower"):
+            # skip vision and audio tensors
             return []
         yield from super().modify_tensors(data_torch, name, bid)
 
@@ -5993,11 +5994,11 @@ class UltravoxModel(TextModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        raise NotImplementedError("Ultravox does not have text decoder. Please use --mmproj argument")
+        raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
 
 
-@ModelBase.register("UltravoxModel")
-class UltravoxAudioModel(MmprojModel):
+@ModelBase.register("Qwen2AudioForConditionalGeneration")
+class WhisperEncoderModel(MmprojModel):
     has_vision_encoder = False # no vision encoder
     has_audio_encoder = True
 
@@ -6009,10 +6010,9 @@ def __init__(self, *args, **kwargs):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
         self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
         self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
-        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims # unused
@@ -6023,6 +6023,10 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
 
+        if name.startswith("language_model."):
+            # skip language model tensors
+            return []
+
         # prevent clash naming with vision tensors
         if name.startswith("multi_modal_projector"):
             name = "audio." + name
@@ -6033,6 +6037,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(self.map_tensor_name(name), data_torch)]
 
+
+@ModelBase.register("UltravoxModel")
+class UltravoxWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False # no vision encoder
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
+
 ###### CONVERSION LOGIC ######
docs/multimodal.md

Lines changed: 4 additions & 0 deletions
@@ -93,4 +93,8 @@ NOTE: some models may require large context window, for example: `-c 8192`
 # Ultravox 0.5
 (tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
 (tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
+
+# Qwen2-Audio and SeaLLM-Audio
+# note: no pre-quantized GGUF for these models, as quantization gives very poor results
+# ref: https://github.com/ggml-org/llama.cpp/pull/13760
 ```

gguf-py/gguf/constants.py

Lines changed: 4 additions & 0 deletions
@@ -546,6 +546,7 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_FFN_GATE = auto()
     A_ENC_FFN_DOWN = auto()
     A_MMPROJ = auto()
+    A_MMPROJ_FC = auto()
     A_MM_NORM_PRE = auto()
     A_MM_NORM_MID = auto()
 
@@ -825,6 +826,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
     MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
     MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
+    MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
     MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
     MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
 }
@@ -885,6 +887,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.A_ENC_FFN_GATE,
         MODEL_TENSOR.A_ENC_FFN_DOWN,
         MODEL_TENSOR.A_MMPROJ,
+        MODEL_TENSOR.A_MMPROJ_FC,
         MODEL_TENSOR.A_MM_NORM_PRE,
         MODEL_TENSOR.A_MM_NORM_MID,
     ],
@@ -2256,6 +2259,7 @@ class VisionProjectorType:
     QWEN25VL = "qwen2.5vl_merger"
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
+    QWEN2A = "qwen2a" # audio
 
 
 # Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 4 additions & 0 deletions
@@ -1165,6 +1165,10 @@ class TensorNameMap:
         "audio.multi_modal_projector.linear_{bid}", # ultravox
     ),
 
+    MODEL_TENSOR.A_MMPROJ_FC: (
+        "audio.multi_modal_projector.linear", # qwen2audio
+    ),
+
     MODEL_TENSOR.A_MM_NORM_PRE: (
         "audio.multi_modal_projector.ln_pre", # ultravox
     ),
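If the mapping above is wired up correctly, the HF-side projector tensor (after the converter prepends its `audio.` prefix) should resolve to the GGUF base name `mm.a.fc`. A quick sanity-check sketch, assuming gguf-py's `get_tensor_name_map` helper and the `MODEL_ARCH.MMPROJ` arch used by `MmprojModel` (block count arbitrary here):

```python
# Sanity-check sketch: HF-side name -> GGUF-side name (assumptions noted above).
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, 32)
name = tmap.get_name("audio.multi_modal_projector.linear.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # expected: mm.a.fc.weight
```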

tools/mtmd/clip-impl.h

Lines changed: 3 additions & 0 deletions
@@ -107,6 +107,7 @@
 // ultravox
 #define TN_CONV1D       "a.conv1d.%d.%s"
 #define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
+#define TN_MM_AUDIO_FC  "mm.a.fc.%s" // fully connected layer
 #define TN_MM_NORM_PRE  "mm.a.norm_pre.%s"
 #define TN_MM_NORM_MID  "mm.a.norm_mid.%s"
 
@@ -128,6 +129,7 @@ enum projector_type {
     PROJECTOR_TYPE_ULTRAVOX,
     PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_LLAMA4,
+    PROJECTOR_TYPE_QWEN2A,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -145,6 +147,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
     { PROJECTOR_TYPE_INTERNVL, "internvl"},
     { PROJECTOR_TYPE_LLAMA4,   "llama4"},
+    { PROJECTOR_TYPE_QWEN2A,   "qwen2a"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {

tools/mtmd/clip.cpp

Lines changed: 83 additions & 37 deletions
@@ -254,7 +254,9 @@ struct clip_vision_model {
     ggml_tensor * post_ln_w;
     ggml_tensor * post_ln_b;
 
-    ggml_tensor * projection;
+    ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
+    ggml_tensor * mm_fc_w;
+    ggml_tensor * mm_fc_b;
 
     // LLaVA projection
     ggml_tensor * mm_input_norm_w = nullptr;
@@ -1471,48 +1473,58 @@ struct clip_graph {
 
         cb(cur, "after_transformer", -1);
 
-        // StackAudioFrames
-        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
-        {
-            int64_t stride = n_embd * hparams.proj_stack_factor;
-            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
-            int64_t pad = padded_len - ggml_nelements(cur);
-            if (pad > 0) {
-                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
-                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+        if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
+            // StackAudioFrames
+            // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+            {
+                int64_t stride = n_embd * hparams.proj_stack_factor;
+                int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
+                int64_t pad = padded_len - ggml_nelements(cur);
+                if (pad > 0) {
+                    cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
+                    cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+                }
+                cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+                                    ggml_row_size(cur->type, stride), 0);
             }
-            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
-                                ggml_row_size(cur->type, stride), 0);
-        }
 
-        cb(cur, "after_stacked", -1);
+            cb(cur, "after_stacked", -1);
 
-        // UltravoxProjector
-        {
-            // pre-norm
-            cur = ggml_rms_norm(ctx0, cur, 1e-6);
-            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+            // UltravoxProjector
+            {
+                // pre-norm
+                cur = ggml_rms_norm(ctx0, cur, 1e-6);
+                cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
 
-            // ffn in
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+                // ffn in
+                cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
 
-            // swiglu
-            {
-                int64_t split_point = cur->ne[0] / 2;
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+                // swiglu
+                {
+                    int64_t split_point = cur->ne[0] / 2;
+                    ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                    ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                    // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
+                    x1 = ggml_silu(ctx0, x1);
+                    cur = ggml_mul(ctx0, x0, x1);
+                }
 
-                // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
-                x1 = ggml_silu(ctx0, x1);
-                cur = ggml_mul(ctx0, x0, x1);
+                // mid-norm
+                cur = ggml_rms_norm(ctx0, cur, 1e-6);
+                cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+                // ffn out
+                cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
             }
 
-            // mid-norm
-            cur = ggml_rms_norm(ctx0, cur, 1e-6);
-            cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+        } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+            // projector
+            cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
-            // ffn out
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+        } else {
+            GGML_ABORT("%s: unknown projector type", __func__);
         }
 
         cb(cur, "projected", -1);
@@ -1655,6 +1667,17 @@
             inpL = cur;
         }
 
+        // TODO @ngxson : find a way to move this outside
+        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+            ggml_tensor * cur = inpL;
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cont(ctx0, cur);
+            cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0);
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cont(ctx0, cur);
+            inpL = cur;
+        }
+
         // post-layernorm
         if (model.post_ln_w) {
             inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
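This block halves the number of audio positions with an average pool; the surrounding transposes put the time axis on the dimension that ggml_pool_1d pools over. In PyTorch terms, a minimal sketch (shapes illustrative):

```python
# Minimal sketch of the pooling step above; shapes are illustrative.
import torch
import torch.nn.functional as F

x = torch.randn(1500, 1280)  # (n_positions, n_embd) out of the whisper encoder
# transpose so time is the pooled axis, AvgPool1d(2, stride=2), transpose back
pooled = F.avg_pool1d(x.T.unsqueeze(0), kernel_size=2, stride=2).squeeze(0).T
print(pooled.shape)  # torch.Size([750, 1280])
```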
@@ -1952,6 +1975,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 res = graph.build_llama4();
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_QWEN2A:
             {
                 res = graph.build_whisper_enc();
             } break;
@@ -2186,8 +2210,10 @@ struct clip_model_loader {
                 };
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_QWEN2A:
             {
-                get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
+                bool require_stack = ctx_clip.proj_type == PROJECTOR_TYPE_ULTRAVOX;
+                get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                 if (hparams.n_mel_bins != 128) {
                     throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
                 }
@@ -2266,7 +2292,7 @@ struct clip_model_loader {
             return cur;
         };
 
-        auto & vision_model = ctx_clip.vision_model;
+        auto & vision_model = ctx_clip.vision_model; // TODO: rename this to just "model"
 
         vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
 
@@ -2463,6 +2489,15 @@ struct clip_model_loader {
                     vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                     vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
                 } break;
+            case PROJECTOR_TYPE_QWEN2A:
+                {
+                    vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                    vision_model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
+                    vision_model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
+                } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
                     vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -3450,6 +3485,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
         const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
         n_patches = n_len / proj_stack_factor / 2;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+        // divide by 2 because of whisper
+        // another divide by 2 because of nn.AvgPool1d(2, stride=2)
+        n_patches = img->nx / 4;
     }
 
     return n_patches;
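A worked example of that arithmetic, assuming Whisper's standard 10 ms mel hop (so 30 s of audio yields nx = 3000 mel positions):

```python
# n_patches = nx / 4, step by step (numbers illustrative)
nx = 3000                      # mel positions for a 30 s clip at a 10 ms hop
after_conv = nx // 2           # whisper conv stem, stride 2
after_pool = after_conv // 2   # nn.AvgPool1d(2, stride=2)
assert after_pool == nx // 4
print(after_pool)              # 750
```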
@@ -3850,6 +3889,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_ULTRAVOX:
             {
                 // do nothing
@@ -3910,7 +3950,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int n_tokens_out = embeddings->ne[1];
     const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
     if (n_tokens_out != expected_n_tokens_out) {
-        LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
+        LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
         GGML_ABORT("Invalid number of output tokens");
     }
 
@@ -3955,6 +3995,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->vision_model.mm_3_w->ne[1];
         case PROJECTOR_TYPE_LLAMA4:
             return ctx->vision_model.mm_model_proj->ne[1];
+        case PROJECTOR_TYPE_QWEN2A:
+            return ctx->vision_model.mm_fc_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -3991,6 +4033,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.has_audio;
 }
 
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type == PROJECTOR_TYPE_QWEN2A;
+}
+
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);

tools/mtmd/clip.h

Lines changed: 3 additions & 0 deletions
@@ -4,6 +4,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+// !!! Internal header, to be used by mtmd only !!!
+
 struct clip_ctx;
 
 struct clip_image_size {
@@ -99,3 +101,4 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
