@@ -81,6 +81,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
+#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
 #define KEY_N_FF "clip.%s.feed_forward_length"
@@ -526,6 +527,7 @@ struct clip_ctx {
     bool has_vision_encoder  = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
+    int minicpmv_version = 2;

     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -641,7 +643,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        if (ctx->minicpmv_version == 2) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 3) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
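
Note: the first dimension of pos_embed is the language model's hidden size, which the resampler output must match: 4096 for minicpmv_version 2 (MiniCPM-Llama3-V 2.5, built on Llama-3-8B) and 3584 for version 3 (MiniCPM-V 2.6, built on Qwen2-7B). The same version-to-dimension branch recurs below in the resampler attention, the sinusoidal position embedding, and clip_n_mmproj_embd; a hypothetical helper (not part of this patch) could centralize it:

    // Sketch only; assumes the minicpmv_version field introduced above.
    static int minicpmv_embed_dim(const struct clip_ctx * ctx) {
        switch (ctx->minicpmv_version) {
            case 2:  return 4096; // MiniCPM-Llama3-V 2.5 (Llama-3-8B hidden size)
            case 3:  return 3584; // MiniCPM-V 2.6 (Qwen2-7B hidden size)
            default: return 4096; // preserve the pre-patch behaviour
        }
    }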
@@ -768,8 +775,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_gelu(ctx0, embeddings);
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+    }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
         // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -949,10 +956,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }

         { // attention
-            const int hidden_size = 4096;
+            int hidden_size = 4096;
             const int d_head = 128;
-            const int n_head = hidden_size/d_head;
-            const int num_query = 96;
+            int n_head = hidden_size/d_head;
+            int num_query = 96;
+            if (ctx->minicpmv_version == 2) {
+                hidden_size = 4096;
+                n_head = hidden_size/d_head;
+                num_query = 96;
+            }
+            else if (ctx->minicpmv_version == 3) {
+                hidden_size = 3584;
+                n_head = hidden_size/d_head;
+                num_query = 64;
+            }

             struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
             Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
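
Note: with d_head fixed at 128, the branch yields 4096/128 = 32 attention heads and 96 resampler queries for version 2, versus 3584/128 = 28 heads and 64 queries for version 3. The initial values of hidden_size, n_head and num_query are overwritten in both branches, so they only take effect if minicpmv_version is neither 2 nor 3.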
@@ -1149,6 +1166,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
     }

+    idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
+    if (idx != -1) {
+        new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
+    }
+
     // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

     GGML_ASSERT(new_clip->has_vision_encoder);
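
Note: because minicpmv_version is initialized to 2 in clip_ctx, projector GGUF files that predate the new key silently keep the old MiniCPM-V 2.5 behaviour; only files that carry clip.minicpmv_version opt into the version-3 code paths. For illustration, a writer-side sketch using ggml's gguf C API (the actual key is emitted by the Python conversion scripts, so this snippet is hypothetical):

    // ctx_out: a gguf_context being prepared for an mmproj file.
    gguf_set_val_i32(ctx_out, "clip.minicpmv_version", 3); // mark a MiniCPM-V 2.6 projector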
@@ -1910,10 +1932,12 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
-    if (clip_is_minicpmv(ctx)) {
-        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
+
+    if (clip_is_minicpmv(ctx)){
+        int max_slice_nums = 9;
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
         res_imgs->size = 0;
-        for (size_t i = 0; i < imgs.size(); ++i) {
+        for (size_t i = 0; i < imgs.size(); ++i){
             res_imgs->size += imgs[i].size();
         }
         res_imgs->data = new clip_image_f32[res_imgs->size];
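
Note: uhd_slice_image gains an explicit cap on the number of high-resolution slices; max_slice_nums = 9 matches MiniCPM-V's default slicing limit, so preprocessing output is presumably unchanged for existing models and the parameter simply makes the limit tunable at the call site.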
@@ -2146,7 +2170,12 @@ int clip_n_patches(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
     } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        n_patches = 96;
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
     }

     return n_patches;
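
Note: these counts must agree with num_query in the resampler attention above, since each image slice is compressed to exactly that many output embeddings: 96 tokens per slice for version 2, 64 for version 3.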
@@ -2282,6 +2311,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size    = hparams.patch_size;
     const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    if (ctx->load_image_size == nullptr) {
+        ctx->load_image_size = clip_image_size_init();
+    }
+    const int pos_w = ctx->load_image_size->width/patch_size;
+    const int pos_h = ctx->load_image_size->height/patch_size;

     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -2316,8 +2350,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
             struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
             int * positions_data = (int *)malloc(ggml_nbytes(positions));
-            for (int i = 0; i < num_positions; i++) {
-                positions_data[i] = std::floor(70.0*i/num_positions);
+            int bucket_coords_h[70];
+            int bucket_coords_w[70];
+            for (int i = 0; i < pos_h; i++){
+                bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+            }
+            for (int i = 0; i < pos_w; i++){
+                bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+            }
+            for (int i = 0, id = 0; i < pos_h; i++){
+                for (int j = 0; j < pos_w; j++){
+                    positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                }
             }
             ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
             free(positions_data);
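
Note: instead of stretching a single 0..69 ramp across all positions, each patch row and column is now bucketed independently into 70 bins and the pair is flattened as h*70 + w, so every patch indexes one cell of a 70x70 grid of interpolated position ids. One caveat: bucket_coords_h and bucket_coords_w are fixed 70-entry stack arrays, so the code implicitly assumes pos_h and pos_w never exceed 70. A standalone sketch of the mapping (hypothetical names):

    #include <cmath>

    // Map patch (i, j) in a pos_h x pos_w grid onto one of 70*70 bucket ids,
    // mirroring the loop added above.
    int bucket_id(int i, int j, int pos_h, int pos_w) {
        const int n_buckets = 70;
        int h = (int) std::floor((double) n_buckets * i / pos_h);
        int w = (int) std::floor((double) n_buckets * j / pos_w);
        return h * n_buckets + w; // row-major index into the 70x70 table
    }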
@@ -2328,12 +2372,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
             // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
             struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
-            if (ctx->load_image_size == nullptr) {
-                ctx->load_image_size = clip_image_size_init();
-            }
-            int pos_w = ctx->load_image_size->width/patch_size;
-            int pos_h = ctx->load_image_size->height/patch_size;
             int embed_dim = 4096;
+            if (ctx->minicpmv_version == 2) {
+                embed_dim = 4096;
+            }
+            else if (ctx->minicpmv_version == 3) {
+                embed_dim = 3584;
+            }
             auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

             float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
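
Note: get_2d_sincos_pos_embed builds a fixed (pos_w * pos_h) x embed_dim sinusoidal table in the Qwen-VL style referenced above, so embed_dim here must equal the first dimension of the pos_embed tensor created in clip_image_build_graph; that is why the same 4096/3584 branch appears in both places (and is another spot the helper sketched earlier could serve). The null check and pos_w/pos_h computation removed here were hoisted to the top of the function in the earlier hunk, since the bucketed position ids now need them too.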
@@ -2346,7 +2391,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
             free(pos_embed_data);
         }
-    } else {
+    }
+    else {
         {
             if (ctx->has_class_embedding) {
                 struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
@@ -2548,13 +2594,21 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        return 4096;
+        if (ctx->minicpmv_version == 2) {
+            return 4096;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            return 3584;
+        }
     }

     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }

-bool clip_is_minicpmv(const struct clip_ctx * ctx) {
-    return ctx->has_minicpmv_projector;
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    if (ctx->has_minicpmv_projector) {
+        return ctx->minicpmv_version;
+    }
+    return 0;
 }
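
Note: widening clip_is_minicpmv from bool to int stays compatible with existing boolean uses (0 is still falsy, any version number is truthy) while letting callers branch on the model generation; the declaration in clip.h and any callers in llava.cpp would need the matching signature change. Also, clip_n_mmproj_embd now falls through to the unsupported-projector throw when a resampler reports a version other than 2 or 3. A hypothetical caller-side sketch:

    // Assumes the new int-returning signature; v is 0, 2, or 3.
    if (int v = clip_is_minicpmv(ctx_clip)) {
        const int tokens_per_slice = (v == 3) ? 64 : 96; // consistent with clip_n_patches
        // ... version-specific handling ...
    }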