Commit 8b51e7f

ok, missing SinusoidsPositionEmbedding

1 parent bb92d1d

File tree: 4 files changed, +60 -16 lines

convert_hf_to_gguf.py

Lines changed: 47 additions & 15 deletions
@@ -1124,6 +1124,8 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+
     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
 
@@ -1160,8 +1162,7 @@ def __init__(self, *args, **kwargs):
 
         # TODO @ngxson : this is a hack to support both vision and audio encoders
         have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
-        self.block_count = 128 if have_multiple_encoders else \
-            self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"], True)
+        self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
@@ -1185,33 +1186,51 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
 
             # vision config
-            self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
-            self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
-            self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
-            self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
-            self.gguf_writer.add_vision_block_count(self.block_count)
-            self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+            self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
+            self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
+            self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
+            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
 
             # preprocessor config
             self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
             self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
 
-        elif self.has_audio_encoder:
+        if self.has_audio_encoder:
             self.gguf_writer.add_clip_has_audio_encoder(True)
             self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
 
             # audio config
-            self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
-            self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
-            self.gguf_writer.add_audio_block_count(self.block_count)
-            self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
+            self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
+            self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
+            self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
+            self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
 
         else:
             raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
         raise ValueError("MmprojModel does not support vocab writing")
 
+    def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in self.hparams), None)
+        assert self.hparams_vision is not None
+        return self._find_param(self.hparams_vision, keys, optional)
+
+    def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in self.hparams), None)
+        assert self.hparams_audio is not None
+        return self._find_param(self.hparams_audio, keys, optional)
+
+    def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in obj), None)
+        if key is not None:
+            return obj[key]
+        if optional:
+            return None
+        raise KeyError(f"could not find any of: {keys}")
+
 
 @ModelBase.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(TextModel):
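The new find_vparam/find_aparam helpers route hyperparameter lookups to the encoder-specific config dict instead of the merged text-model hparams. A minimal standalone sketch of the first-matching-key pattern behind _find_param (the config dict and its values are made up for illustration; the unused `key = next(...)` assignments in find_vparam/find_aparam above appear to be leftovers and are omitted here):

from collections.abc import Iterable
from typing import Any

def find_param(obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
    # return the value of the first key that is present in obj
    key = next((k for k in keys if k in obj), None)
    if key is not None:
        return obj[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")

# hypothetical vision config: checkpoints disagree on the name of the layer count
vision_cfg = {"depth": 32, "hidden_size": 1280}
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]

print(find_param(vision_cfg, n_block_keys))               # 32, matched via "depth"
print(find_param(vision_cfg, ["n_head"], optional=True))  # None instead of raising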
@@ -2743,9 +2762,9 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
         elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
             if model_type == 'qwen2_5_omni':
-                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
-            else:
                 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
+            else:
+                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
             self.gguf_writer.add_vision_use_silu(True)
             # find n_wa_pattern (window attention pattern)
             fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@@ -2808,6 +2827,19 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
     has_vision_encoder = True
     has_audio_encoder = True
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_audio is not None
+        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
+        self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_audio is not None
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
+
     def get_vision_config(self) -> dict[str, Any] | None:
         return self.global_config["thinker_config"].get("vision_config")
 
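Qwen2.5-Omni's audio tower ships whisper-style hyperparameter names, so __init__ aliases them onto the generic keys that the shared MmprojModel path queries through find_aparam. A hedged sketch of that normalization; the values below are illustrative, not taken from a real checkpoint:

# whisper-style audio config (hypothetical values)
hparams_audio = {
    "d_model": 1280,
    "encoder_ffn_dim": 5120,
    "encoder_attention_heads": 20,
    "num_mel_bins": 128,
}

# alias onto the generic names used by the shared GGUF-writing code
hparams_audio["hidden_size"] = hparams_audio["d_model"]
hparams_audio["intermediate_size"] = hparams_audio["encoder_ffn_dim"]
hparams_audio["num_attention_heads"] = hparams_audio["encoder_attention_heads"]

# find_aparam(["hidden_size"]) in the base class now resolves without
# any model-specific branching
assert hparams_audio["hidden_size"] == 1280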

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
@@ -130,6 +130,7 @@ enum projector_type {
     PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_LLAMA4,
     PROJECTOR_TYPE_QWEN2A,
+    PROJECTOR_TYPE_QWEN25O,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -148,6 +149,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_INTERNVL,  "internvl"},
     { PROJECTOR_TYPE_LLAMA4,    "llama4"},
     { PROJECTOR_TYPE_QWEN2A,    "qwen2a"},
+    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
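With the table extended, the "qwen2.5o" string written into the GGUF metadata can be resolved back to the new enum value. A toy Python model of the reverse lookup that clip_projector_type_from_string performs over PROJECTOR_TYPE_NAMES (assumed from the table above; enum values shown as strings, table truncated to two entries):

PROJECTOR_TYPE_NAMES = {
    "PROJECTOR_TYPE_QWEN2A":  "qwen2a",
    "PROJECTOR_TYPE_QWEN25O": "qwen2.5o",  # new entry in this commit
}

def projector_type_from_string(s: str) -> str:
    # scan the enum -> name table; fall back to UNKNOWN on no match
    for proj_type, name in PROJECTOR_TYPE_NAMES.items():
        if name == s:
            return proj_type
    return "PROJECTOR_TYPE_UNKNOWN"

assert projector_type_from_string("qwen2.5o") == "PROJECTOR_TYPE_QWEN25O"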

tools/mtmd/clip.cpp

Lines changed: 10 additions & 1 deletion
@@ -415,6 +415,7 @@ struct clip_ctx {
         }
     }
 
+    // this function is added so that we don't change too much of the existing code
     projector_type proj_type() const {
         return model.proj_type;
     }
@@ -2086,6 +2087,13 @@ struct clip_model_loader {
         if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
             throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
         }
+
+        // correct arch for multimodal models
+        if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
+            model.proj_type = modality == CLIP_MODALITY_VISION
+                ? PROJECTOR_TYPE_QWEN25VL
+                : PROJECTOR_TYPE_QWEN2A;
+        }
     }
 
     const bool is_vision = model.modality == CLIP_MODALITY_VISION;
@@ -4078,7 +4086,8 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
 }
 
 bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
-    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A;
+    return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
+        || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A;
 }
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
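A Qwen2.5-Omni GGUF stores the single projector string "qwen2.5o", but the loader builds one clip context per modality, so the new block in clip_model_loader rewrites the projector type before anything dispatches on it: the vision context then behaves like qwen2.5vl and the audio context like qwen2a. A toy Python model of that decision (the authoritative logic is the C++ above):

def resolve_projector(proj_type: str, modality: str) -> str:
    # split the combined qwen2.5o projector per modality
    if proj_type == "qwen2.5o":
        return "qwen2.5vl" if modality == "vision" else "qwen2a"
    return proj_type

assert resolve_projector("qwen2.5o", "vision") == "qwen2.5vl"
assert resolve_projector("qwen2.5o", "audio") == "qwen2a"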

tools/mtmd/mtmd.cpp

Lines changed: 1 addition & 0 deletions
@@ -104,6 +104,7 @@ struct mtmd_context {
     int n_threads;
     std::string media_marker;
 
+    // these are not tokens, but strings used to mark the beginning and end of image/audio embeddings
     std::string img_beg;
     std::string img_end;
     std::string aud_beg;
