@@ -432,6 +432,9 @@ def load_hparams(dir_model: Path):
432
432
if "llm_config" in config :
433
433
# rename for InternVL
434
434
config ["text_config" ] = config ["llm_config" ]
435
+ if "thinker_config" in config :
436
+ # rename for Qwen2.5-Omni
437
+ config ["text_config" ] = config ["thinker_config" ]["text_config" ]
435
438
return config
436
439
437
440
@classmethod
@@ -1124,15 +1127,16 @@ class MmprojModel(ModelBase):
1124
1127
has_vision_encoder : bool = True # by default
1125
1128
has_audio_encoder : bool = False
1126
1129
1130
+ # for models having multiple encoders, we need to separate their hparams
1131
+ hparams_vision : dict [str , Any ] | None = None
1132
+ hparams_audio : dict [str , Any ] | None = None
1133
+
1127
1134
def __init__ (self , * args , ** kwargs ):
1128
1135
super ().__init__ (* args , ** kwargs )
1129
1136
1130
1137
if self .model_arch != gguf .MODEL_ARCH .MMPROJ :
1131
1138
raise TypeError ("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ" )
1132
1139
1133
- if self .has_vision_encoder and self .has_audio_encoder :
1134
- raise NotImplementedError ("both vision + audio not supported yet" )
1135
-
1136
1140
# get n_embd of the text model
1137
1141
if "text_config" not in self .hparams :
1138
1142
self .hparams ["text_config" ] = {}
@@ -1143,22 +1147,33 @@ def __init__(self, *args, **kwargs):
1143
1147
assert self .n_embd_text > 0 , "n_embd not found in hparams"
1144
1148
1145
1149
# move vision config to the top level, while preserving the original hparams in global_config
1146
- self .global_config = self .hparams
1147
-
1148
- if "vision_config" in self .hparams :
1149
- self .hparams = self .hparams ["vision_config" ]
1150
- elif "audio_config" in self .hparams :
1151
- self .hparams = self .hparams ["audio_config" ]
1152
- else :
1150
+ import copy
1151
+ self .global_config = copy .deepcopy (self .hparams )
1152
+ self .hparams_vision = self .get_vision_config ()
1153
+ self .hparams_audio = self .get_audio_config ()
1154
+
1155
+ if self .hparams_vision is None and self .hparams_audio is None :
1153
1156
raise ValueError ("vision_config / audio_config not found in hparams" )
1154
1157
1155
- self .block_count = self .find_hparam (["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ])
1158
+ # for compat with vision-only models
1159
+ self .hparams = self .hparams_vision or self .hparams_audio or self .hparams
1160
+
1161
+ # TODO @ngxson : this is a hack to support both vision and audio encoders
1162
+ have_multiple_encoders = self .has_audio_encoder and self .has_vision_encoder
1163
+ self .block_count = 128 if have_multiple_encoders else \
1164
+ self .find_hparam (["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ], True )
1156
1165
self .tensor_map = gguf .get_tensor_name_map (gguf .MODEL_ARCH .MMPROJ , self .block_count )
1157
1166
1158
1167
# load preprocessor config
1159
1168
with open (self .dir_model / "preprocessor_config.json" , "r" , encoding = "utf-8" ) as f :
1160
1169
self .preprocessor_config = json .load (f )
1161
1170
1171
def get_vision_config(self) -> dict[str, Any] | None:
    """Return the vision-encoder config from the top-level hparams, or None if absent.

    Subclasses whose checkpoints nest the config elsewhere override this.
    """
    config = self.global_config
    return config.get("vision_config")
1174
def get_audio_config(self) -> dict[str, Any] | None:
    """Return the audio-encoder config from the top-level hparams, or None if absent.

    Subclasses whose checkpoints nest the config elsewhere override this.
    """
    config = self.global_config
    return config.get("audio_config")
1162
1177
def set_type(self):
    """Tag the output GGUF file as a multimodal projector (mmproj)."""
    writer = self.gguf_writer
    writer.add_type(gguf.GGUFType.MMPROJ)
1179
@@ -2674,7 +2689,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
2674
2689
yield from super ().modify_tensors (data_torch , name , bid )
2675
2690
2676
2691
2677
- @ModelBase .register ("Qwen2VLModel" , "Qwen2VLForConditionalGeneration" , "Qwen2_5_VLForConditionalGeneration" )
2692
+ @ModelBase .register (
2693
+ "Qwen2VLModel" ,
2694
+ "Qwen2VLForConditionalGeneration" ,
2695
+ "Qwen2_5_VLForConditionalGeneration" ,
2696
+ "Qwen2_5OmniModel" ,
2697
+ )
2678
2698
class Qwen2VLModel (TextModel ):
2679
2699
model_arch = gguf .MODEL_ARCH .QWEN2VL
2680
2700
@@ -2692,8 +2712,11 @@ def set_vocab(self):
2692
2712
2693
2713
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Map a checkpoint tensor name to its GGUF name, dropping non-text tensors.

    Qwen2.5-Omni stores its text model under a "thinker." prefix; strip it so
    the text tensors map exactly like a plain Qwen2-VL checkpoint. The vision/
    audio encoders and the talker/token2wav speech stack are skipped here
    (they are converted separately as mmproj, or unsupported).

    Returns a list of (gguf_name, tensor) pairs, or [] to skip the tensor.
    """
    del bid  # unused
    if name.startswith("thinker."):
        # rename for Qwen2.5-Omni: thinker.* holds the text model
        name = name.replace("thinker.", "")
    # str.startswith accepts a tuple of prefixes: one call instead of four
    if name.startswith(("visual", "audio", "talker", "token2wav")):
        # skip multimodal tensors
        return []
    return [(self.map_tensor_name(name), data_torch)]
2699
2722
@@ -2702,21 +2725,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
2702
2725
class Qwen2VLVisionModel (MmprojModel ):
2703
2726
def __init__(self, *args, **kwargs):
    """Normalize Qwen2-VL vision hparams to the key names the converter expects."""
    super().__init__(*args, **kwargs)
    vis = self.hparams_vision
    assert vis is not None
    # default used when the checkpoint omits image_size
    vis.setdefault("image_size", 560)
    # rename config.json values to the standard keys
    for dst, src in (("num_attention_heads", "num_heads"), ("num_hidden_layers", "depth")):
        vis[dst] = vis.get(src)
    if "embed_dim" in vis:  # qwen2vl naming scheme
        # embed_dim is the real hidden size; the old hidden_size is the FFN size
        vis["intermediate_size"] = vis.get("hidden_size")
        vis["hidden_size"] = vis.get("embed_dim")
2712
2736
2713
2737
def set_gguf_parameters (self ):
2714
2738
super ().set_gguf_parameters ()
2715
- hparams = self .hparams
2716
- if self .global_config ['model_type' ] == 'qwen2_vl' :
2739
+ assert self .hparams_vision is not None
2740
+ hparams = self .hparams_vision
2741
+ model_type = self .global_config ['model_type' ]
2742
+ if model_type == 'qwen2_vl' :
2717
2743
self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN2VL )
2718
- elif self .global_config ['model_type' ] == 'qwen2_5_vl' :
2719
- self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
2744
+ elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni' :
2745
+ if model_type == 'qwen2_5_omni' :
2746
+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
2747
+ else :
2748
+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25O )
2720
2749
self .gguf_writer .add_vision_use_silu (True )
2721
2750
# find n_wa_pattern (window attention pattern)
2722
2751
fullatt_block_indexes = hparams .get ("fullatt_block_indexes" )
@@ -2774,6 +2803,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
2774
2803
return [] # skip other tensors
2775
2804
2776
2805
2806
@ModelBase.register("Qwen2_5OmniModel")
class Qwen25OmniModel(Qwen2VLVisionModel):
    """Mmproj converter for Qwen2.5-Omni: vision encoder plus audio tower."""
    has_vision_encoder = True
    has_audio_encoder = True

    def get_vision_config(self) -> dict[str, Any] | None:
        # Omni nests both encoder configs under thinker_config
        return self.global_config["thinker_config"].get("vision_config")

    def get_audio_config(self) -> dict[str, Any] | None:
        return self.global_config["thinker_config"].get("audio_config")

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if name.startswith("thinker."):
            name = name.replace("thinker.", "")

        if not name.startswith("audio_tower"):
            # vision (and everything else) goes through the Qwen2-VL logic
            return super().modify_tensors(data_torch, name, bid)

        # process audio tensors
        if "audio_bos_eos_token" in name:
            # this tensor is left unused in transformers code
            # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
            return []
        return [(self.map_tensor_name(name), data_torch)]
2777
2832
@ModelBase .register ("InternVisionModel" )
2778
2833
class InternVisionModel (MmprojModel ):
2779
2834
def set_gguf_parameters (self ):
0 commit comments