@@ -432,6 +432,9 @@ def load_hparams(dir_model: Path):
         if "llm_config" in config:
             # rename for InternVL
             config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
         return config
 
     @classmethod
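In a Qwen2.5-Omni checkpoint the language-model settings live one level deeper than usual, inside thinker_config, so the rename above hoists them to the top-level text_config key the rest of the converter expects. A minimal sketch of that nesting, assuming the key layout implied by the diff (the sizes are illustrative placeholders, not values read from a real config):

config = {
    "thinker_config": {
        "text_config":   {"hidden_size": 3584, "num_hidden_layers": 28},
        "vision_config": {"depth": 32, "hidden_size": 1280},
        "audio_config":  {"d_model": 1280, "encoder_layers": 32},
    },
    # talker / token2wav sub-configs (speech output) are not converted
}
# equivalent of the new branch in load_hparams:
config["text_config"] = config["thinker_config"]["text_config"]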
@@ -1121,18 +1124,21 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+
     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
 
+    # for models having multiple encoders, we need to separate their hparams
+    hparams_vision: dict[str, Any] | None = None
+    hparams_audio: dict[str, Any] | None = None
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
             raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
 
-        if self.has_vision_encoder and self.has_audio_encoder:
-            raise NotImplementedError("both vision + audio not supported yet")
-
         # get n_embd of the text model
         if "text_config" not in self.hparams:
             self.hparams["text_config"] = {}
@@ -1143,22 +1149,32 @@ def __init__(self, *args, **kwargs):
         assert self.n_embd_text > 0, "n_embd not found in hparams"
 
         # move vision config to the top level, while preserving the original hparams in global_config
-        self.global_config = self.hparams
+        import copy
+        self.global_config = copy.deepcopy(self.hparams)
+        self.hparams_vision = self.get_vision_config()
+        self.hparams_audio = self.get_audio_config()
 
-        if "vision_config" in self.hparams:
-            self.hparams = self.hparams["vision_config"]
-        elif "audio_config" in self.hparams:
-            self.hparams = self.hparams["audio_config"]
-        else:
+        if self.hparams_vision is None and self.hparams_audio is None:
             raise ValueError("vision_config / audio_config not found in hparams")
 
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
+        # for compat with vision-only models
+        self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
+
+        # TODO @ngxson : this is a hack to support both vision and audio encoders
+        have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
+        self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
         with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
             self.preprocessor_config = json.load(f)
 
+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("audio_config")
+
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
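With this change a single-encoder model behaves as before: self.hparams ends up pointing at whichever sub-config exists. A toy trace of the resolution, using plain dicts rather than a real config:

global_config = {"text_config": {"hidden_size": 3584}, "vision_config": {"depth": 32}}
hparams_vision = global_config.get("vision_config")  # {"depth": 32}
hparams_audio = global_config.get("audio_config")    # None -> vision-only model
hparams = hparams_vision or hparams_audio or global_config
assert hparams == {"depth": 32}

When both encoders are present no single block count makes sense, hence the 128 placeholder; the real per-encoder depths are written later through find_vparam/find_aparam with n_block_keys.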
@@ -1170,33 +1186,49 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
 
             # vision config
-            self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
-            self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
-            self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
-            self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
-            self.gguf_writer.add_vision_block_count(self.block_count)
-            self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+            self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
+            self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
+            self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
+            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
 
             # preprocessor config
             self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
             self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
 
-        elif self.has_audio_encoder:
+        if self.has_audio_encoder:
             self.gguf_writer.add_clip_has_audio_encoder(True)
             self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
 
             # audio config
-            self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
-            self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
-            self.gguf_writer.add_audio_block_count(self.block_count)
-            self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
+            self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
+            self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
+            self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
+            self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
 
         else:
             raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
         raise ValueError("MmprojModel does not support vocab writing")
 
+    def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        assert self.hparams_vision is not None
+        return self._find_param(self.hparams_vision, keys, optional)
+
+    def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        assert self.hparams_audio is not None
+        return self._find_param(self.hparams_audio, keys, optional)
+
+    def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in obj), None)
+        if key is not None:
+            return obj[key]
+        if optional:
+            return None
+        raise KeyError(f"could not find any of: {keys}")
+
 
 @ModelBase.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(TextModel):
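The lookup helpers are self-contained, so their contract can be checked in isolation. A standalone restatement of _find_param from the hunk above, with illustrative asserts:

from typing import Any, Iterable

def _find_param(obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
    # the first key present in the dict wins; optional=True turns a miss into None
    key = next((k for k in keys if k in obj), None)
    if key is not None:
        return obj[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")

n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
assert _find_param({"depth": 32}, n_block_keys) == 32        # qwen-style vision key
assert _find_param({}, n_block_keys, optional=True) is None  # tolerated miss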
@@ -2674,7 +2706,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+@ModelBase.register(
+    "Qwen2VLModel",
+    "Qwen2VLForConditionalGeneration",
+    "Qwen2_5_VLForConditionalGeneration",
+    "Qwen2_5OmniModel",
+)
 class Qwen2VLModel(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2VL
 
@@ -2692,8 +2729,11 @@ def set_vocab(self):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
-        if name.startswith("visual."):
-            # skip visual tensors
+        if name.startswith("thinker."):
+            name = name.replace("thinker.", "")
+        if name.startswith("visual") or name.startswith("audio") or \
+                name.startswith("talker") or name.startswith("token2wav"):
+            # skip multimodal tensors
             return []
         return [(self.map_tensor_name(name), data_torch)]
 
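A toy trace of the new filter, with tensor names assumed from the Qwen2.5-Omni checkpoint layout (thinker.* wraps the text model; the vision/audio towers are handled by the mmproj class below, and talker/token2wav have no GGUF counterpart):

for name in (
    "thinker.model.layers.0.self_attn.q_proj.weight",  # text -> converted
    "thinker.visual.blocks.0.attn.qkv.weight",         # vision -> skipped here
    "talker.model.layers.0.mlp.gate_proj.weight",      # speech output -> skipped
):
    if name.startswith("thinker."):
        name = name.replace("thinker.", "")
    skipped = name.startswith(("visual", "audio", "talker", "token2wav"))
    print(f"{name}: {'skipped' if skipped else 'converted'}")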
@@ -2702,21 +2742,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 class Qwen2VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["image_size"] = self.hparams.get("image_size", 560)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
         # rename config.json values
-        self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
-        self.hparams["num_hidden_layers"] = self.hparams.get("depth")
-        if "embed_dim" in self.hparams:  # qwen2vl
-            self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
-            self.hparams["hidden_size"] = self.hparams.get("embed_dim")
+        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+        if "embed_dim" in self.hparams_vision:  # qwen2vl
+            self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
+            self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        hparams = self.hparams
-        if self.global_config['model_type'] == 'qwen2_vl':
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+        model_type = self.global_config['model_type']
+        if model_type == 'qwen2_vl':
             self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
-        elif self.global_config['model_type'] == 'qwen2_5_vl':
-            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
+        elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
+            if model_type == 'qwen2_5_omni':
+                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
+            else:
+                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
             self.gguf_writer.add_vision_use_silu(True)
             # find n_wa_pattern (window attention pattern)
             fullatt_block_indexes = hparams.get("fullatt_block_indexes")
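The n_wa_pattern derivation itself sits just below this hunk and is unchanged, so the diff does not show it; it infers the window-attention period from the first full-attention block index and validates even spacing. A sketch assuming a Qwen2.5-VL-style value of fullatt_block_indexes:

fullatt_block_indexes = [7, 15, 23, 31]      # assumed example config value
n_wa_pattern = fullatt_block_indexes[0] + 1  # full attention every 8th block
assert all(b - a == n_wa_pattern
           for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]))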
@@ -2774,6 +2820,66 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return []  # skip other tensors
 
 
+@ModelBase.register("Qwen2_5OmniModel")
+class Qwen25OmniModel(Qwen2VLVisionModel):
+    has_vision_encoder = True
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_audio is not None
+        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
+        self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
+        self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_audio is not None
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("audio_config")
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        # SinusoidsPositionEmbedding
+        assert self.hparams_audio is not None
+        max_timescale = 10000
+        length = 1500
+        channels = self.hparams_audio["hidden_size"]
+        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
+        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+        pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
+        yield ("audio_tower.embed_positions.weight", pos_embd)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims  # unused
+        if ".conv" in name and ".weight" in name:
+            return gguf.GGMLQuantizationType.F16
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("thinker."):
+            name = name.replace("thinker.", "")
+
+        if name.startswith("audio_tower"):
+            # process audio tensors
+            if "conv1.bias" in name or "conv2.bias" in name:
+                # reshape conv1 and conv2 bias from (n,) to (n, 1)
+                data_torch = data_torch.unsqueeze(-1)
+            if "audio_bos_eos_token" in name:
+                # this tensor is left unused in transformers code
+                # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
+                return []
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
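generate_extra_tensors in the new Qwen25OmniModel materializes the audio tower's Whisper-style sinusoidal position table, which the HF checkpoint omits because it is deterministic. A standalone restatement with the shapes spelled out (channels = 1280 is an assumed d_model, not read from a real config):

import numpy as np
import torch

length, channels = 1500, 1280  # max audio positions; assumed encoder width
log_timescale_increment = np.log(10000) / (channels // 2 - 1)
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
scaled_time = torch.arange(length)[:, None] * inv_timescales[None, :]  # (1500, 640)
pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
assert pos_embd.shape == (length, channels)  # one sin/cos row per audio position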