@@ -249,9 +249,11 @@ struct clip_vision_model {
     struct ggml_tensor * mm_4_w = nullptr;
     struct ggml_tensor * mm_4_b = nullptr;

-    // GLMV-Edge projection
+    // GLMV-Edge projection
     struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
     struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
+    struct ggml_tensor * mm_glm_tok_boi = nullptr;
+    struct ggml_tensor * mm_glm_tok_eoi = nullptr;

     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1559,6 +1561,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
             embeddings = ggml_mul(ctx0, embeddings,x);
             embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
         }
+        // arrangement of BOI/EOI token embeddings
+        // note: these embeddings are not present in text model, hence we cannot process them as text tokens
+        // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
+        {
+            embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI
+            embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI
+        }
     }

     else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
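For readers unfamiliar with ggml_concat, the following standalone sketch (not part of the patch) shows the shape effect of the two concatenations added above: the BOI embedding is prepended and the EOI embedding appended along dimension 1, the token dimension. The sizes n_embd and n_tokens are hypothetical placeholders, not the actual GLM-Edge configuration.

#include "ggml.h"
#include <cstdio>

int main(void) {
    // no_alloc: we only build tensor metadata to inspect shapes, no data or compute needed
    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    const int64_t n_embd   = 2048; // hypothetical embedding width
    const int64_t n_tokens =  576; // hypothetical number of image tokens after the adapter

    struct ggml_tensor * embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * tok_boi    = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, 1);
    struct ggml_tensor * tok_eoi    = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, 1);

    // concat along dim 1: BOI first, then the image tokens, then EOI
    embeddings = ggml_concat(ctx0, tok_boi, embeddings, 1);
    embeddings = ggml_concat(ctx0, embeddings, tok_eoi, 1);

    // prints [2048, 578]: two extra token positions for BOI and EOI
    printf("[%lld, %lld]\n", (long long) embeddings->ne[0], (long long) embeddings->ne[1]);

    ggml_free(ctx0);
    return 0;
}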
@@ -1972,12 +1981,14 @@ struct clip_model_loader {
                 {
                     vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
                     vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
-                    vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight"));
-                    vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight"));
-                    vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias"));
-                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
-                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
-                    vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
+                    vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
+                    vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
+                    vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
+                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
+                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
+                    vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
+                    vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
+                    vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
                 } break;
             case PROJECTOR_TYPE_QWEN2VL:
             case PROJECTOR_TYPE_QWEN25VL:
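The two new get_tensor calls assume the conversion script has written the BOI/EOI embeddings into the mmproj GGUF alongside the adapter tensors. One way to check a converted file is to list its tensor names with the gguf API; a minimal sketch (not part of the patch, file name hypothetical):

#include "gguf.h"
#include <cstdio>

int main(void) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,
        /*.ctx      =*/ NULL,
    };
    struct gguf_context * gctx = gguf_init_from_file("mmproj-glm-edge-v.gguf", params);
    if (!gctx) {
        fprintf(stderr, "failed to open gguf file\n");
        return 1;
    }
    const int64_t n_tensors = gguf_get_n_tensors(gctx);
    for (int64_t i = 0; i < n_tensors; ++i) {
        // the BOI/EOI embeddings should show up here if the conversion exported them
        printf("%s\n", gguf_get_tensor_name(gctx, i));
    }
    gguf_free(gctx);
    return 0;
}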
@@ -2948,6 +2959,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im

     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
         n_patches /= 4;
+        n_patches += 2; // for BOI and EOI token embeddings
     } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
         if (ctx->minicpmv_version == 2) {
             n_patches = 96;
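To make the token-count arithmetic concrete, here is a small standalone sketch (not part of the patch); the image and patch sizes are illustrative assumptions, not the actual GLM-Edge configuration:

#include <cstdio>

int main(void) {
    // hypothetical vision tower configuration, for illustration only
    const int image_size = 672;
    const int patch_size = 14;

    int n_patches = (image_size / patch_size) * (image_size / patch_size); // 48 * 48 = 2304
    n_patches /= 4;  // the adapter halves the patch grid in each spatial dimension -> 576
    n_patches += 2;  // BOI and EOI token embeddings -> 578

    printf("n_output_tokens = %d\n", n_patches);
    return 0;
}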