@@ -1124,6 +1124,8 @@ class MmprojModel(ModelBase):
1124
1124
preprocessor_config : dict [str , Any ]
1125
1125
global_config : dict [str , Any ]
1126
1126
1127
+ n_block_keys = ["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ]
1128
+
1127
1129
has_vision_encoder : bool = True # by default
1128
1130
has_audio_encoder : bool = False
1129
1131
@@ -1160,8 +1162,7 @@ def __init__(self, *args, **kwargs):
1160
1162
1161
1163
# TODO @ngxson : this is a hack to support both vision and audio encoders
1162
1164
have_multiple_encoders = self .has_audio_encoder and self .has_vision_encoder
1163
- self .block_count = 128 if have_multiple_encoders else \
1164
- self .find_hparam (["n_layers" , "num_hidden_layers" , "n_layer" , "num_layers" , "depth" ], True )
1165
+ self .block_count = 128 if have_multiple_encoders else self .find_hparam (self .n_block_keys , True )
1165
1166
self .tensor_map = gguf .get_tensor_name_map (gguf .MODEL_ARCH .MMPROJ , self .block_count )
1166
1167
1167
1168
# load preprocessor config
@@ -1185,33 +1186,51 @@ def set_gguf_parameters(self):
1185
1186
self .gguf_writer .add_vision_projection_dim (self .n_embd_text )
1186
1187
1187
1188
# vision config
1188
- self .gguf_writer .add_vision_image_size (self .find_hparam (["image_size" ]))
1189
- self .gguf_writer .add_vision_patch_size (self .find_hparam (["patch_size" ]))
1190
- self .gguf_writer .add_vision_embedding_length (self .find_hparam (["hidden_size" ]))
1191
- self .gguf_writer .add_vision_feed_forward_length (self .find_hparam (["intermediate_size" ]))
1192
- self .gguf_writer .add_vision_block_count (self .block_count )
1193
- self .gguf_writer .add_vision_head_count (self .find_hparam (["num_attention_heads" ]))
1189
+ self .gguf_writer .add_vision_image_size (self .find_vparam (["image_size" ]))
1190
+ self .gguf_writer .add_vision_patch_size (self .find_vparam (["patch_size" ]))
1191
+ self .gguf_writer .add_vision_embedding_length (self .find_vparam (["hidden_size" ]))
1192
+ self .gguf_writer .add_vision_feed_forward_length (self .find_vparam (["intermediate_size" ]))
1193
+ self .gguf_writer .add_vision_block_count (self .find_vparam ( self . n_block_keys ) )
1194
+ self .gguf_writer .add_vision_head_count (self .find_vparam (["num_attention_heads" ]))
1194
1195
1195
1196
# preprocessor config
1196
1197
self .gguf_writer .add_vision_image_mean (self .preprocessor_config ["image_mean" ])
1197
1198
self .gguf_writer .add_vision_image_std (self .preprocessor_config ["image_std" ])
1198
1199
1199
- elif self .has_audio_encoder :
1200
+ if self .has_audio_encoder :
1200
1201
self .gguf_writer .add_clip_has_audio_encoder (True )
1201
1202
self .gguf_writer .add_audio_projection_dim (self .n_embd_text )
1202
1203
1203
1204
# audio config
1204
- self .gguf_writer .add_audio_embedding_length (self .find_hparam (["hidden_size" ]))
1205
- self .gguf_writer .add_audio_feed_forward_length (self .find_hparam (["intermediate_size" ]))
1206
- self .gguf_writer .add_audio_block_count (self .block_count )
1207
- self .gguf_writer .add_audio_head_count (self .find_hparam (["num_attention_heads" ]))
1205
+ self .gguf_writer .add_audio_embedding_length (self .find_aparam (["hidden_size" ]))
1206
+ self .gguf_writer .add_audio_feed_forward_length (self .find_aparam (["intermediate_size" ]))
1207
+ self .gguf_writer .add_audio_block_count (self .find_aparam ( self . n_block_keys ) )
1208
+ self .gguf_writer .add_audio_head_count (self .find_aparam (["num_attention_heads" ]))
1208
1209
1209
1210
else :
1210
1211
raise ValueError ("MmprojModel must have either vision or audio encoder" )
1211
1212
1212
1213
def write_vocab (self ):
1213
1214
raise ValueError ("MmprojModel does not support vocab writing" )
1214
1215
1216
def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
    """Return the value of the first of `keys` present in the vision hparams.

    Args:
        keys: Candidate key names, tried in order.
        optional: When True, return None instead of raising if no key matches.

    Raises:
        AssertionError: if `self.hparams_vision` is None.
        KeyError: if no key matches and `optional` is False.
    """
    # NOTE: `keys` is deliberately not touched here. Probing it against another
    # dict first would advance a one-shot iterator before the real lookup runs.
    assert self.hparams_vision is not None
    return self._find_param(self.hparams_vision, keys, optional)
1220
+
1221
def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
    """Return the value of the first of `keys` present in the audio hparams.

    Args:
        keys: Candidate key names, tried in order.
        optional: When True, return None instead of raising if no key matches.

    Raises:
        AssertionError: if `self.hparams_audio` is None.
        KeyError: if no key matches and `optional` is False.
    """
    # NOTE: `keys` is deliberately not touched here. Probing it against another
    # dict first would advance a one-shot iterator before the real lookup runs.
    assert self.hparams_audio is not None
    return self._find_param(self.hparams_audio, keys, optional)
1225
+
1226
def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
    """Return `obj[k]` for the first `k` in `keys` that is present in `obj`.

    Args:
        obj: Mapping to search.
        keys: Candidate key names, tried in order.
        optional: When True, return None instead of raising if no key matches.

    Raises:
        KeyError: if none of `keys` is in `obj` and `optional` is False.
    """
    # Materialise once: `keys` may be a one-shot iterator, and the error
    # message below needs to list the candidates after the search has run.
    candidates = list(keys)
    for name in candidates:
        if name in obj:
            return obj[name]
    if optional:
        return None
    raise KeyError(f"could not find any of: {candidates}")
1233
+
1215
1234
1216
1235
@ModelBase .register ("GPTNeoXForCausalLM" )
1217
1236
class GPTNeoXModel (TextModel ):
@@ -2743,9 +2762,9 @@ def set_gguf_parameters(self):
2743
2762
self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN2VL )
2744
2763
elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni' :
2745
2764
if model_type == 'qwen2_5_omni' :
2746
- self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
2747
- else :
2748
2765
self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25O )
2766
+ else :
2767
+ self .gguf_writer .add_clip_projector_type (gguf .VisionProjectorType .QWEN25VL )
2749
2768
self .gguf_writer .add_vision_use_silu (True )
2750
2769
# find n_wa_pattern (window attention pattern)
2751
2770
fullatt_block_indexes = hparams .get ("fullatt_block_indexes" )
@@ -2808,6 +2827,19 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
2808
2827
has_vision_encoder = True
2809
2828
has_audio_encoder = True
2810
2829
2830
def __init__(self, *args, **kwargs):
    """Initialise the base model, then alias audio hparams under generic names.

    After the parent constructor runs, three entries of `self.hparams_audio`
    are copied to the key names `hidden_size`, `intermediate_size` and
    `num_attention_heads`. Presumably this lets generic lookups such as
    `find_aparam(["hidden_size"])` resolve for this audio config (confirm
    against the parent class).

    Raises:
        AssertionError: if `self.hparams_audio` is None after base init.
        KeyError: if one of the source keys is missing from the audio hparams.
    """
    super().__init__(*args, **kwargs)
    audio_cfg = self.hparams_audio
    assert audio_cfg is not None
    # (generic name, name used by this audio config), applied in order
    aliases = (
        ("hidden_size", "d_model"),
        ("intermediate_size", "encoder_ffn_dim"),
        ("num_attention_heads", "encoder_attention_heads"),
    )
    for generic_name, source_name in aliases:
        audio_cfg[generic_name] = audio_cfg[source_name]
2836
+
2837
def set_gguf_parameters(self):
    """Write the parent's GGUF parameters, then two extra audio fields.

    Adds the mel-bin count (`num_mel_bins`, required) and the attention
    layer-norm epsilon (`layer_norm_eps`, defaulting to 1e-5 when absent),
    both read from `self.hparams_audio`.

    Raises:
        AssertionError: if `self.hparams_audio` is None.
        KeyError: if `num_mel_bins` is missing from the audio hparams.
    """
    super().set_gguf_parameters()
    audio_cfg = self.hparams_audio
    assert audio_cfg is not None
    writer = self.gguf_writer
    writer.add_audio_num_mel_bins(audio_cfg["num_mel_bins"])
    layernorm_eps = audio_cfg.get("layer_norm_eps", 1e-5)
    writer.add_audio_attention_layernorm_eps(layernorm_eps)
2842
+
2811
2843
def get_vision_config(self) -> dict[str, Any] | None:
    """Return the `vision_config` entry nested under `thinker_config`.

    Reads `self.global_config["thinker_config"]` (required) and returns its
    `vision_config` value, or None when that entry is absent.
    """
    thinker_cfg = self.global_config["thinker_config"]
    return thinker_cfg.get("vision_config")
2813
2845
0 commit comments