Commit 58c7849 (parent: 2f099b5)

mtmd : allow multiple modalities at the same time

6 files changed: +493 −342 lines

convert_hf_to_gguf.py

Lines changed: 79 additions & 24 deletions
@@ -432,6 +432,9 @@ def load_hparams(dir_model: Path):
         if "llm_config" in config:
             # rename for InternVL
             config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
         return config

     @classmethod
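
For context, Qwen2.5-Omni nests its language-model settings inside thinker_config, and the new branch above flattens them back to the usual text_config location. A minimal sketch of the shape this expects (key names follow the hunks in this commit; the values are made up for illustration):

    # illustrative config shape; values are placeholders, not real checkpoint data
    config = {
        "thinker_config": {
            "text_config": {"hidden_size": 3584},      # the LLM ("thinker")
            "vision_config": {"depth": 32},
            "audio_config": {"num_hidden_layers": 32},
        },
    }

    # what the new branch in load_hparams() does:
    if "thinker_config" in config:
        config["text_config"] = config["thinker_config"]["text_config"]

    assert config["text_config"]["hidden_size"] == 3584
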
@@ -1124,15 +1127,16 @@ class MmprojModel(ModelBase):
     has_vision_encoder: bool = True # by default
     has_audio_encoder: bool = False

+    # for models having multiple encoders, we need to separate their hparams
+    hparams_vision: dict[str, Any] | None = None
+    hparams_audio: dict[str, Any] | None = None
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

         if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
             raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")

-        if self.has_vision_encoder and self.has_audio_encoder:
-            raise NotImplementedError("both vision + audio not supported yet")
-
         # get n_embd of the text model
         if "text_config" not in self.hparams:
             self.hparams["text_config"] = {}
@@ -1143,22 +1147,33 @@ def __init__(self, *args, **kwargs):
         assert self.n_embd_text > 0, "n_embd not found in hparams"

         # move vision config to the top level, while preserving the original hparams in global_config
-        self.global_config = self.hparams
-
-        if "vision_config" in self.hparams:
-            self.hparams = self.hparams["vision_config"]
-        elif "audio_config" in self.hparams:
-            self.hparams = self.hparams["audio_config"]
-        else:
+        import copy
+        self.global_config = copy.deepcopy(self.hparams)
+        self.hparams_vision = self.get_vision_config()
+        self.hparams_audio = self.get_audio_config()
+
+        if self.hparams_vision is None and self.hparams_audio is None:
             raise ValueError("vision_config / audio_config not found in hparams")

-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
+        # for compat with vision-only models
+        self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
+
+        # TODO @ngxson : this is a hack to support both vision and audio encoders
+        have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
+        self.block_count = 128 if have_multiple_encoders else \
+            self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"], True)
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

         # load preprocessor config
         with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
             self.preprocessor_config = json.load(f)

+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("audio_config")
+
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
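
The hard-coded block_count of 128 is safe because it only bounds how many per-block ("{bid}") name variants the tensor map pre-expands; an overestimate costs a few unused dictionary entries, never a missed tensor. A rough sketch of that behavior, assuming the gguf-py TensorNameMap API as used in this file:

    import gguf

    # block_count controls how many "{bid}" instantiations the map generates,
    # so 128 covers whichever of the two encoders has more layers
    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, 128)

    # e.g. an audio tensor from block 5 should resolve, because the template
    # "audio_tower.layers.{bid}.fc2" was expanded for bid = 0..127
    print(tmap.get_name("audio_tower.layers.5.fc2"))
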
@@ -2674,7 +2689,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+@ModelBase.register(
+    "Qwen2VLModel",
+    "Qwen2VLForConditionalGeneration",
+    "Qwen2_5_VLForConditionalGeneration",
+    "Qwen2_5OmniModel",
+)
 class Qwen2VLModel(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2VL

@@ -2692,8 +2712,11 @@ def set_vocab(self):

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
-        if name.startswith("visual."):
-            # skip visual tensors
+        if name.startswith("thinker."):
+            name = name.replace("thinker.", "")
+        if name.startswith("visual") or name.startswith("audio") or \
+                name.startswith("talker") or name.startswith("token2wav"):
+            # skip multimodal tensors
             return []
         return [(self.map_tensor_name(name), data_torch)]

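
In other words, the text-model conversion now accepts Omni checkpoints by stripping the "thinker." prefix from LLM tensors and dropping everything that belongs to the other towers. A toy check of that filter (the tensor names are illustrative, not taken from a real checkpoint):

    def keep(name: str) -> bool:
        # mirrors the filter above: un-prefix thinker tensors, drop multimodal ones
        if name.startswith("thinker."):
            name = name.replace("thinker.", "")
        return not name.startswith(("visual", "audio", "talker", "token2wav"))

    assert keep("thinker.model.layers.0.self_attn.q_proj.weight")   # LLM weight: kept
    assert not keep("thinker.visual.blocks.0.attn.qkv.weight")      # vision: skipped
    assert not keep("talker.model.layers.0.mlp.gate_proj.weight")   # speech: skipped
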
@@ -2702,21 +2725,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 class Qwen2VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["image_size"] = self.hparams.get("image_size", 560)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
         # rename config.json values
-        self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
-        self.hparams["num_hidden_layers"] = self.hparams.get("depth")
-        if "embed_dim" in self.hparams: # qwen2vl
-            self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
-            self.hparams["hidden_size"] = self.hparams.get("embed_dim")
+        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+        if "embed_dim" in self.hparams_vision: # qwen2vl
+            self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
+            self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        hparams = self.hparams
-        if self.global_config['model_type'] == 'qwen2_vl':
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+        model_type = self.global_config['model_type']
+        if model_type == 'qwen2_vl':
             self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
-        elif self.global_config['model_type'] == 'qwen2_5_vl':
-            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
+        elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
+            if model_type == 'qwen2_5_omni':
+                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
+            else:
+                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
             self.gguf_writer.add_vision_use_silu(True)
             # find n_wa_pattern (window attention pattern)
             fullatt_block_indexes = hparams.get("fullatt_block_indexes")
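
For reference, fullatt_block_indexes lists the vision blocks that use full (non-windowed) attention, e.g. [7, 15, 23, 31], and the code following this hunk derives a single repeat period n_wa_pattern from it. A sketch of that derivation, assuming the indexes are evenly spaced (this mirrors the intent of the code below the fold, not necessarily its exact wording):

    def derive_n_wa_pattern(fullatt_block_indexes: list[int]) -> int:
        # [7, 15, 23, 31] -> every 8th block is full attention -> pattern of 8
        n_wa_pattern = fullatt_block_indexes[0] + 1
        for i, idx in enumerate(fullatt_block_indexes):
            if idx != (i + 1) * n_wa_pattern - 1:
                raise ValueError("fullatt_block_indexes is not evenly spaced")
        return n_wa_pattern

    assert derive_n_wa_pattern([7, 15, 23, 31]) == 8
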
@@ -2774,6 +2803,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return [] # skip other tensors


+@ModelBase.register("Qwen2_5OmniModel")
+class Qwen25OmniModel(Qwen2VLVisionModel):
+    has_vision_encoder = True
+    has_audio_encoder = True
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("audio_config")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("thinker."):
+            name = name.replace("thinker.", "")
+
+        if name.startswith("audio_tower"):
+            # process audio tensors
+            if "audio_bos_eos_token" in name:
+                # this tensor is left unused in transformers code
+                # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
+                return []
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
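
With this class registered, a single Qwen2.5-Omni checkpoint can be converted twice: once into a text-model GGUF (via Qwen2VLModel, which now accepts the "Qwen2_5OmniModel" architecture and skips the multimodal tensors) and once into a combined vision+audio mmproj GGUF (via Qwen25OmniModel). Assuming the script's usual command-line interface (the --outfile and --mmproj flags; paths are placeholders), that would look roughly like:

    # text model: Qwen2VLModel handles architecture "Qwen2_5OmniModel"
    python convert_hf_to_gguf.py /path/to/Qwen2.5-Omni-7B --outfile model.gguf

    # multimodal projector: Qwen25OmniModel emits both encoders into one file
    python convert_hf_to_gguf.py /path/to/Qwen2.5-Omni-7B --outfile mmproj.gguf --mmproj
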

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -2260,6 +2260,7 @@ class VisionProjectorType:
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
+    QWEN25O = "qwen2.5o" # omni


 # Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 5 additions & 0 deletions
@@ -1125,6 +1125,7 @@ class TensorNameMap:

         MODEL_TENSOR.A_POST_NORM: (
             "audio_tower.layer_norm", # ultravox
+            "audio_tower.ln_post", # qwen2omni
         ),

         MODEL_TENSOR.A_ENC_ATTN_Q: (
@@ -1161,12 +1162,16 @@ class TensorNameMap:
             "audio_tower.layers.{bid}.fc2", # ultravox
         ),

+        # note: some tensors below have an "audio." pseudo-prefix, to prevent conflicts with vision tensors
+        # this prefix is added in the conversion code in modify_tensors()
+
         MODEL_TENSOR.A_MMPROJ: (
             "audio.multi_modal_projector.linear_{bid}", # ultravox
         ),

         MODEL_TENSOR.A_MMPROJ_FC: (
             "audio.multi_modal_projector.linear", # qwen2audio
+            "audio_tower.proj", # qwen2omni
         ),

         MODEL_TENSOR.A_MM_NORM_PRE: (
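
The pseudo-prefix exists because a checkpoint can use the same source name (e.g. multi_modal_projector.linear) for both a vision and an audio projector; rewriting the audio name before lookup keeps the two namespaces apart in this map. Roughly what the conversion side does (simplified; the real rewriting lives in each model's modify_tensors()):

    hf_name = "multi_modal_projector.linear"   # name as found in the checkpoint

    # modify_tensors() prepends the pseudo-prefix for audio tensors, so the
    # lookup key becomes unambiguous and matches the A_MMPROJ_FC entry above
    lookup_name = "audio." + hf_name
    assert lookup_name == "audio.multi_modal_projector.linear"
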
