From 99874e5e348e2e41dc05758b2d5e60062dd8f060 Mon Sep 17 00:00:00 2001 From: Achazwl <323163497@qq.com> Date: Fri, 26 Apr 2024 16:37:46 +0800 Subject: [PATCH 1/5] support minicpmv --- .gitignore | 1 + Makefile | 8 +- convert-hf-to-gguf.py | 2 + examples/CMakeLists.txt | 1 + examples/minicpmv/CMakeLists.txt | 37 + examples/minicpmv/README.md | 19 + examples/minicpmv/android/adb_run.sh | 53 + examples/minicpmv/clip.cpp | 2134 +++++++++++++++++ examples/minicpmv/clip.h | 85 + .../minicpmv/convert-image-encoder-to-gguf.py | 403 ++++ examples/minicpmv/minicpm-surgery.py | 48 + examples/minicpmv/minicpmv-cli.cpp | 331 +++ examples/minicpmv/minicpmv.cpp | 556 +++++ examples/minicpmv/minicpmv.h | 52 + examples/minicpmv/requirements.txt | 3 + gguf-py/gguf/constants.py | 3 + gguf-py/gguf/gguf_writer.py | 3 + llama.cpp | 30 +- 18 files changed, 3754 insertions(+), 15 deletions(-) create mode 100644 examples/minicpmv/CMakeLists.txt create mode 100644 examples/minicpmv/README.md create mode 100755 examples/minicpmv/android/adb_run.sh create mode 100644 examples/minicpmv/clip.cpp create mode 100644 examples/minicpmv/clip.h create mode 100644 examples/minicpmv/convert-image-encoder-to-gguf.py create mode 100644 examples/minicpmv/minicpm-surgery.py create mode 100644 examples/minicpmv/minicpmv-cli.cpp create mode 100644 examples/minicpmv/minicpmv.cpp create mode 100644 examples/minicpmv/minicpmv.h create mode 100644 examples/minicpmv/requirements.txt diff --git a/.gitignore b/.gitignore index 5c14900844435..8f3c1b91c14c0 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,7 @@ models-mnt /libllama.so /llama-bench /llava-cli +/minicpmv-cli /lookahead /lookup /lookup-create diff --git a/Makefile b/Makefile index b0b2ea997ee12..d56689273bceb 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli minicpmv-cli baby-llama beam-search \ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests @@ -846,6 +846,12 @@ llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/cli $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) +minicpmv-cli: examples/minicpmv/minicpmv-cli.cpp examples/minicpmv/clip.h examples/minicpmv/clip.cpp examples/minicpmv/minicpmv.h examples/minicpmv/minicpmv.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/minicpmv/clip.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/minicpmv/minicpmv.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/minicpmv.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/minicpmv/clip.cpp examples/minicpmv/minicpmv.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, 
examples/minicpmv/clip.cpp) $(call GET_OBJ_FILE, examples/minicpmv/minicpmv.cpp) -o $@ $(LDFLAGS)
+
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 4fd916cba3ed7..278edccdccc45 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1638,6 +1638,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
+        if "tie_word_embeddings" in self.hparams:
+            self.gguf_writer.add_tie_lm_head(self.hparams["tie_word_embeddings"])

     def set_vocab(self):
         self._set_vocab_llama_hf()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f421769cc2f0a..33d3e65bad536 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -26,6 +26,7 @@ else()
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
+    add_subdirectory(minicpmv)
     if (LLAMA_SYCL)
         add_subdirectory(sycl)
     endif()
diff --git a/examples/minicpmv/CMakeLists.txt b/examples/minicpmv/CMakeLists.txt
new file mode 100644
index 0000000000000..3d247b7e78262
--- /dev/null
+++ b/examples/minicpmv/CMakeLists.txt
@@ -0,0 +1,37 @@
+add_library(minicpmv OBJECT
+            minicpmv.cpp
+            minicpmv.h
+            clip.cpp
+            clip.h
+            )
+
+target_link_libraries(minicpmv PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(minicpmv PUBLIC .)
+target_include_directories(minicpmv PUBLIC ../..)
+target_include_directories(minicpmv PUBLIC ../../common)
+
+target_compile_features(minicpmv PRIVATE cxx_std_11)
+
+add_library(minicpmv_static STATIC $<TARGET_OBJECTS:minicpmv>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(minicpmv PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(minicpmv PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(minicpmv_shared SHARED $<TARGET_OBJECTS:minicpmv>)
+    target_link_libraries(minicpmv_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS minicpmv_shared LIBRARY)
+endif()
+
+if (NOT MSVC)
+    target_compile_options(minicpmv PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+
+if(TARGET BUILD_INFO)
+    add_dependencies(minicpmv BUILD_INFO)
+endif()
+
+set(TARGET minicpmv-cli)
+add_executable(minicpmv-cli minicpmv-cli.cpp)
+install(TARGETS minicpmv-cli RUNTIME)
+target_link_libraries(minicpmv-cli PRIVATE common minicpmv ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(minicpmv-cli PRIVATE cxx_std_11)
diff --git a/examples/minicpmv/README.md b/examples/minicpmv/README.md
new file mode 100644
index 0000000000000..a66c2cb1b394d
--- /dev/null
+++ b/examples/minicpmv/README.md
@@ -0,0 +1,19 @@
+# 所有命令在 llama.cpp 根目录执行,模型位于根目录上级目录处
+# All commands should be executed from the root of the llama.cpp repo. We assume the MiniCPM-V-2 model is placed in its parent folder.
+
+```bash
+make
+make minicpmv-cli
+
+python ./examples/minicpmv/minicpm-surgery.py -m ../MiniCPM-V-2
+python ./examples/minicpmv/convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2 --llava-projector ../MiniCPM-V-2/llava.projector --output-dir ../MiniCPM-V-2 --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
+python ./convert-hf-to-gguf.py ../MiniCPM-V-2/MiniCPM
+./minicpmv-cli -m ../MiniCPM-V-2/MiniCPM/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2/mmproj-model-f16.gguf -c 4096 --temp 0.6 --top-p 0.8 --top-k 100 --repeat-penalty 1.0 --image ../test.jpg -p "这张图里有什么?"
+
+# or run the int4-quantized version
+./quantize ../MiniCPM-V-2/MiniCPM/ggml-model-f16.gguf ../MiniCPM-V-2/MiniCPM/ggml-model-Q4_K_M.gguf Q4_K_M
+./minicpmv-cli -m ../MiniCPM-V-2/MiniCPM/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2/mmproj-model-f16.gguf -c 4096 --temp 0.6 --top-p 0.8 --top-k 100 --repeat-penalty 1.0 --image ../test.jpg -p "这张图里有什么?"
+
+# or run in interactive mode
+./minicpmv-cli -m ../MiniCPM-V-2/MiniCPM/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2/mmproj-model-f16.gguf -c 4096 --temp 0.6 --top-p 0.8 --top-k 100 --repeat-penalty 1.0 --image ../test.jpg -i
+```
diff --git a/examples/minicpmv/android/adb_run.sh b/examples/minicpmv/android/adb_run.sh
new file mode 100755
index 0000000000000..2fbcd19b22b11
--- /dev/null
+++ b/examples/minicpmv/android/adb_run.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
+projector_name="mmproj-model-f16.gguf"
+llama_name="ggml-model-q4_k.gguf"
+img_dir="/Users/cxt/model/llm"
+img_name="demo.jpg"
+prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+# img_name="cat.jpeg"
+# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
+
+program_dir="build_64/bin"
+binName="minicpmv-cli"
+n_threads=4
+
+
+deviceDir="/data/local/tmp"
+saveDir="output"
+if [ ! -d ${saveDir} ]; then
+    mkdir ${saveDir}
+fi
+
+
+function android_run() {
+    # # copy resource into device
+    # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
+    # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
+    adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
+    # copy program into device
+    adb push ${program_dir}/${binName} ${deviceDir}/${binName}
+    adb shell "chmod 0777 ${deviceDir}/${binName}"
+
+    # log the exact command line into the output file, then run it
+    adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
+        -m ${deviceDir}/${llama_name} \
+        --mmproj ${deviceDir}/${projector_name} \
+        -t ${n_threads} \
+        --image ${deviceDir}/${img_name} \
+        -p \"${prompt}\" \
+        > ${deviceDir}/${llama_name}_${projector_name}_${n_threads}_${img_name}.txt"
+    adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
+        -m ${deviceDir}/${llama_name} \
+        --mmproj ${deviceDir}/${projector_name} \
+        -t ${n_threads} \
+        --image ${deviceDir}/${img_name} \
+        -p \"${prompt}\" \
+        >> ${deviceDir}/${llama_name}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
+    adb pull ${deviceDir}/${llama_name}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
+}
+
+android_run
+
+echo "android_run is Done!"
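Before the large clip.cpp diff below, a minimal sketch of how a caller is expected to drive the new clip API. Only `clip_model_load`, `clip_image_u8_init`/`clip_image_u8_free` and `clip_free` are visible in this patch; the image load/preprocess/encode helpers are assumptions borrowed from llava's clip.h:

```cpp
// Minimal driver sketch (not part of the patch). clip_image_load_from_file,
// clip_image_preprocess and clip_image_encode are assumed to mirror llava's
// clip.h; the remaining calls appear in the diff below.
#include "clip.h"

int encode_image(const char * mmproj_path, const char * image_path, float * embd_out) {
    struct clip_ctx * ctx = clip_model_load(mmproj_path, /*verbosity=*/1);
    if (ctx == nullptr) {
        return 1;
    }

    struct clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_file(image_path, img)) { // assumed helper
        clip_image_u8_free(img);
        clip_free(ctx);
        return 1;
    }

    // normalize with image_mean/image_std from the gguf metadata, run the
    // vision tower and the resampler, then write the projected embedding to
    // embd_out -- e.g. via clip_image_preprocess + clip_image_encode (assumed)
    // ...

    clip_image_u8_free(img);
    clip_free(ctx);
    return 0;
}
```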
diff --git a/examples/minicpmv/clip.cpp b/examples/minicpmv/clip.cpp
new file mode 100644
index 0000000000000..dfde452a67cce
--- /dev/null
+++ b/examples/minicpmv/clip.cpp
@@ -0,0 +1,2134 @@
+// NOTE: This is modified from the LLaVA-only clip.cpp,
+// so there might still be unnecessary artifacts hanging around.
+// I'll gradually clean and extend it.
+// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
+#include "clip.h"
+#include "log.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <regex>
+#include <stdexcept>
+#include <vector>
+#include <sstream>
+#include <cinttypes>
+#include <limits>
+
+//#define CLIP_DEBUG_FUNCTIONS
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
+static std::string format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), buf.size());
+}
+
+//
+// key constants
+//
+
+#define KEY_FTYPE               "general.file_type"
+#define KEY_NAME                "general.name"
+#define KEY_DESCRIPTION         "general.description"
+#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
+#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
+#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
+#define KEY_USE_GELU            "clip.use_gelu"
+#define KEY_N_EMBD              "clip.%s.embedding_length"
+#define KEY_N_FF                "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK             "clip.%s.block_count"
+#define KEY_N_HEAD              "clip.%s.attention.head_count"
+#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM            "clip.%s.projection_dim"
+#define KEY_TOKENS              "tokenizer.ggml.tokens"
+#define KEY_N_POSITIONS         "clip.text.context_length"
+#define KEY_IMAGE_SIZE          "clip.vision.image_size"
+#define KEY_PATCH_SIZE          "clip.vision.patch_size"
+#define KEY_IMAGE_MEAN          "clip.vision.image_mean"
+#define KEY_IMAGE_STD           "clip.vision.image_std"
+#define KEY_PROJ_TYPE           "clip.projector_type"
+
+#define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+
+
+//
+// tensor name constants
+//
+
+#define TN_TOKEN_EMBD      "%s.token_embd.weight"
+#define TN_POS_EMBD        "%s.position_embd"
+// #define TN_CLASS_EMBD      "v.class_embd"
+#define TN_PATCH_EMBD      "v.patch_embd.proj.%s"
+#define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
+#define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
+#define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
+#define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s"
+#define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s"
+#define TN_FFN_UP          "%s.blk.%d.ffn_up.%s"
+#define TN_LN_1            "%s.blk.%d.ln1.%s"
+#define TN_LN_2            "%s.blk.%d.ln2.%s"
+// #define TN_LN_PRE          "%s.pre_ln.%s"
+#define TN_LN_POST         "%s.post_ln.%s"
+#define TN_TEXT_PROJ       "text_projection.weight"
+#define TN_VIS_PROJ        "visual_projection.weight"
+#define TN_LLAVA_PROJ      "mm.%d.%s"
+#define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
+#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
+#define TN_IMAGE_NEWLINE   "model.image_newline"
+// MINICPMV
+#define TN_MINICPMV_POS_EMBD "resampler.pos_embed"
+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY "resampler.query"
+#define TN_MINICPMV_PROJ "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
+#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN "resampler.ln_%s.%s"
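+
+// note: the %s / %d placeholders above are expanded with the format() helper,
+// e.g. format(TN_MINICPMV_ATTN, "q", "weight") -> "resampler.attn.q.weight"
+//      format(TN_ATTN_K, "v", 0, "weight")     -> "v.blk.0.attn_k.weight"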
+
+enum projector_type {
+    PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
+    PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_UNKNOWN,
+};
+
+static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
+    { PROJECTOR_TYPE_MLP, "mlp" },
+    { PROJECTOR_TYPE_LDP, "ldp" },
+    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+};
+
+
+//
+// utilities to get data from a gguf file
+//
+
+static int get_key_idx(const gguf_context * ctx, const char * key) {
+    int i = gguf_find_key(ctx, key);
+    if (i == -1) {
+        LOG_TEE("key %s not found in file\n", key);
+        throw std::runtime_error(format("Missing required key: %s", key));
+    }
+
+    return i;
+}
+
+static uint32_t get_u32(const gguf_context * ctx, const std::string & key) {
+    const int i = get_key_idx(ctx, key.c_str());
+
+    return gguf_get_val_u32(ctx, i);
+}
+
+static float get_f32(const gguf_context * ctx, const std::string & key) {
+    const int i = get_key_idx(ctx, key.c_str());
+
+    return gguf_get_val_f32(ctx, i);
+}
+
+static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
+    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
+    if (!cur) {
+        throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
+    }
+
+    return cur;
+}
+
+static std::string get_ftype(int ftype) {
+    return ggml_type_name(static_cast<ggml_type>(ftype));
+}
+
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ?
"true" : "false"; + default: return format("unknown type %d", type); + } +} + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + replace_all(val, "\\", "\\\\"); + replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { + size_t tensor_size = ggml_nbytes(tensor); + LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + prefix, ggml_n_dims(tensor), tensor->name, tensor_size, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); +} + +static projector_type clip_projector_type_from_string(const std::string & name) { + for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } + return PROJECTOR_TYPE_UNKNOWN; +} + +#ifdef CLIP_DEBUG_FUNCTIONS +static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} + +static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned 
char)(fileSize);
+    fileHeader[3] = (unsigned char)(fileSize >> 8);
+    fileHeader[4] = (unsigned char)(fileSize >> 16);
+    fileHeader[5] = (unsigned char)(fileSize >> 24);
+
+    // Bitmap information header (BITMAPINFOHEADER)
+    unsigned char infoHeader[40] = {
+        40,0,0,0, // Size of this header (40 bytes)
+        0,0,0,0,  // Image width
+        0,0,0,0,  // Image height
+        1,0,      // Number of color planes
+        24,0,     // Bits per pixel
+        0,0,0,0,  // No compression
+        0,0,0,0,  // Image size (can be 0 for no compression)
+        0,0,0,0,  // X pixels per meter (not specified)
+        0,0,0,0,  // Y pixels per meter (not specified)
+        0,0,0,0,  // Total colors (color table not used)
+        0,0,0,0   // Important colors (all are important)
+    };
+
+    // Width and height in the information header
+    infoHeader[4] = (unsigned char)(img.nx);
+    infoHeader[5] = (unsigned char)(img.nx >> 8);
+    infoHeader[6] = (unsigned char)(img.nx >> 16);
+    infoHeader[7] = (unsigned char)(img.nx >> 24);
+    infoHeader[8] = (unsigned char)(img.ny);
+    infoHeader[9] = (unsigned char)(img.ny >> 8);
+    infoHeader[10] = (unsigned char)(img.ny >> 16);
+    infoHeader[11] = (unsigned char)(img.ny >> 24);
+
+    // Write file headers
+    file.write(reinterpret_cast<char *>(fileHeader), sizeof(fileHeader));
+    file.write(reinterpret_cast<char *>(infoHeader), sizeof(infoHeader));
+
+    // Pixel data
+    std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
+    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
+        for (int x = 0; x < img.nx; ++x) {
+            // Each pixel
+            size_t pixelIndex = (y * img.nx + x) * 3;
+            unsigned char pixel[3] = {
+                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
+                img.buf[pixelIndex + 1],
+                img.buf[pixelIndex]
+            };
+            file.write(reinterpret_cast<char *>(pixel), 3);
+        }
+        // Write padding for the row
+        file.write(reinterpret_cast<char *>(padding.data()), paddingAmount);
+    }
+
+    file.close();
+}
+
+// debug function to convert f32 to u8
+static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
+    dst.nx = src.nx;
+    dst.ny = src.ny;
+    dst.buf.resize(3 * src.nx * src.ny);
+    for (size_t i = 0; i < src.buf.size(); ++i) {
+        dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
+    }
+}
+#endif
+
+
+//
+// clip layers
+//
+
+struct clip_hparams {
+    int32_t image_size;
+    int32_t patch_size;
+    int32_t hidden_size;
+    int32_t n_intermediate;
+    int32_t projection_dim;
+    int32_t n_head;
+    int32_t n_layer;
+
+    float eps;
+
+    char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
+
+    int32_t image_grid_pinpoints[32];
+    int32_t image_crop_resolution;
+};
+
+struct clip_layer {
+    // attention
+    struct ggml_tensor * k_w;
+    struct ggml_tensor * k_b;
+    struct ggml_tensor * q_w;
+    struct ggml_tensor * q_b;
+    struct ggml_tensor * v_w;
+    struct ggml_tensor * v_b;
+
+    struct ggml_tensor * o_w;
+    struct ggml_tensor * o_b;
+
+    // layernorm 1
+    struct ggml_tensor * ln_1_w;
+    struct ggml_tensor * ln_1_b;
+
+    // ff
+    struct ggml_tensor * ff_i_w;
+    struct ggml_tensor * ff_i_b;
+
+    struct ggml_tensor * ff_o_w;
+    struct ggml_tensor * ff_o_b;
+
+    // layernorm 2
+    struct ggml_tensor * ln_2_w;
+    struct ggml_tensor * ln_2_b;
+};
+
+struct clip_vision_model {
+    struct clip_hparams hparams;
+
+    // embeddings
+    // struct ggml_tensor * class_embedding;
+    struct ggml_tensor * patch_embeddings_w;
+    struct ggml_tensor * patch_embeddings_b;
+    struct ggml_tensor * position_embeddings;
+
+    // struct ggml_tensor * pre_ln_w;
+    // struct ggml_tensor * pre_ln_b;
+
+    std::vector<clip_layer> layers;
+
+    struct ggml_tensor * post_ln_w;
+    struct ggml_tensor * post_ln_b;
+
+    struct ggml_tensor * projection;
+
+    // LLaVA projection
+    struct ggml_tensor * mm_0_w = NULL;
+    struct ggml_tensor * mm_0_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
+
+    struct ggml_tensor * image_newline = NULL;
+
+    // Yi type models with mlp+normalization projection
+    struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_3_w = NULL;
+    struct ggml_tensor * mm_3_b = NULL;
+    struct ggml_tensor * mm_4_w = NULL;
+    struct ggml_tensor * mm_4_b = NULL;
+
+    // MobileVLM projection
+    struct ggml_tensor * mm_model_mlp_1_w;
+    struct ggml_tensor * mm_model_mlp_1_b;
+    struct ggml_tensor * mm_model_mlp_3_w;
+    struct ggml_tensor * mm_model_mlp_3_b;
+    struct ggml_tensor * mm_model_block_1_block_0_0_w;
+    struct ggml_tensor * mm_model_block_1_block_0_1_w;
+    struct ggml_tensor * mm_model_block_1_block_0_1_b;
+    struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
+    struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
+    struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
+    struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
+    struct ggml_tensor * mm_model_block_1_block_2_0_w;
+    struct ggml_tensor * mm_model_block_1_block_2_1_w;
+    struct ggml_tensor * mm_model_block_1_block_2_1_b;
+    struct ggml_tensor * mm_model_block_2_block_0_0_w;
+    struct ggml_tensor * mm_model_block_2_block_0_1_w;
+    struct ggml_tensor * mm_model_block_2_block_0_1_b;
+    struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
+    struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
+    struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
+    struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
+    struct ggml_tensor * mm_model_block_2_block_2_0_w;
+    struct ggml_tensor * mm_model_block_2_block_2_1_w;
+    struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+    // MobileVLM_V2 projection
+    struct ggml_tensor * mm_model_mlp_0_w;
+    struct ggml_tensor * mm_model_mlp_0_b;
+    struct ggml_tensor * mm_model_mlp_2_w;
+    struct ggml_tensor * mm_model_mlp_2_b;
+    struct ggml_tensor * mm_model_peg_0_w;
+    struct ggml_tensor * mm_model_peg_0_b;
+
+    // MINICPMV projection
+    struct ggml_tensor * mm_model_pos_embed;
+    struct ggml_tensor * mm_model_pos_embed_k;
+    struct ggml_tensor * mm_model_query;
+    struct ggml_tensor * mm_model_proj;
+    struct ggml_tensor * mm_model_kv_proj;
+    struct ggml_tensor * mm_model_attn_q_w;
+    struct ggml_tensor * mm_model_attn_q_b;
+    struct ggml_tensor * mm_model_attn_k_w;
+    struct ggml_tensor * mm_model_attn_k_b;
+    struct ggml_tensor * mm_model_attn_v_w;
+    struct ggml_tensor * mm_model_attn_v_b;
+    struct ggml_tensor * mm_model_attn_o_w;
+    struct ggml_tensor * mm_model_attn_o_b;
+    struct ggml_tensor * mm_model_ln_q_w;
+    struct ggml_tensor * mm_model_ln_q_b;
+    struct ggml_tensor * mm_model_ln_kv_w;
+    struct ggml_tensor * mm_model_ln_kv_b;
+    struct ggml_tensor * mm_model_ln_post_w;
+    struct ggml_tensor * mm_model_ln_post_b;
+};
+
+struct clip_ctx {
+    bool has_text_encoder    = false;
+    bool has_vision_encoder  = false;
+    bool has_llava_projector = false;
+
+    struct clip_vision_model vision_model;
+    projector_type proj_type = PROJECTOR_TYPE_MLP;
+
+    float image_mean[3];
+    float image_std[3];
+    bool use_gelu = false;
+    int32_t ftype = 1;
+
+    struct gguf_context * ctx_gguf;
+    struct ggml_context * ctx_data;
+
+    std::vector<uint8_t> buf_compute_meta;
+
+    // memory buffers to evaluate the model
+    ggml_backend_buffer_t params_buffer = NULL;
+
+    ggml_backend_t backend       = NULL;
+    ggml_gallocr_t compute_alloc = NULL;
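+
+    // note: params_buffer holds the model weights on the selected backend;
+    // compute_alloc is a graph allocator sized once at load time from a
+    // worst-case measurement of the inference graph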
+}; + +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { + if (!ctx->has_vision_encoder) { + LOG_TEE("This gguf file seems to have no vision encoder\n"); + return nullptr; + } + + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); + const int num_positions = num_patches; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + const int batch_size = imgs->size; + + if (ctx->has_llava_projector) { + GGML_ASSERT(batch_size == 1); + } + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_w, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + inp = ggml_add(ctx0, inp, model.patch_embeddings_b); + + // concat class_embeddings and patch_embeddings + // struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + // ggml_set_name(embeddings, "embeddings"); + // ggml_set_input(embeddings); + + // embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + // embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + + // embeddings = ggml_acc(ctx0, embeddings, inp, + // embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + struct ggml_tensor * embeddings = + ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + // // pre-layernorm + // { + // embeddings = ggml_norm(ctx0, embeddings, eps); + // ggml_set_name(embeddings, "pre_ln"); + + // embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); + // } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + //const size_t nb_q_w = model.layers[il].q_w->nb[0]; + + // layernorm1 + { + cur = ggml_norm(ctx0, cur, eps); + + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), + model.layers[il].ln_1_b); + } + + // self-attention + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); + + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * K = + 
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = ggml_norm(ctx0, cur, eps); + + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + + if (ctx->use_gelu) { + cur = ggml_gelu_inplace(ctx0, cur); + } else { + cur = ggml_gelu_quick_inplace(ctx0, cur); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + { // post layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); + } + + // llava projector + { + // embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + // struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, 64); + // ggml_set_name(patches, "patches"); + // ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + // embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + + } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, 
model.mm_4_w), + model.mm_4_b); + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + struct ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); + mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 
12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, 
mlp_2);
+            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+            embeddings = peg_0;
+        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            // perceiver-style resampler: a fixed set of learned queries attends
+            // over the projected vision features, compressing them to num_query tokens
+            struct ggml_tensor * q = model.mm_model_query;
+            { // layernorm
+                q = ggml_norm(ctx0, q, eps);
+                q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            }
+            struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+            { // layernorm
+                v = ggml_norm(ctx0, v, eps);
+                v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+            }
+            struct ggml_tensor * k;
+            { // position
+                // queries get the learned position embedding, keys the precomputed one
+                q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+                k = ggml_add(ctx0, v, model.mm_model_pos_embed_k);
+            }
+            { // attention
+                const int hidden_size = 2304; // resampler width, matching the MiniCPM embedding size
+                const int d_head = 128;
+                const int n_head = hidden_size/d_head;
+                const int num_query = 64; // number of learned query tokens
+
+                struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+                Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+                struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+                struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+                // permute
+                Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+                Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+                Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+                K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+                K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+                K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+                V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+                V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+                V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+                KQ = ggml_soft_max_inplace(ctx0, KQ);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+                KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+                KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+
+                embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
+            }
+            { // layernorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
+            }
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+        }
+        else {
+            GGML_ASSERT(false);
+        }
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+// read and create ggml_context containing the tensors and their data
+struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+    struct ggml_context * meta = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &meta,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname, params);
+    if (!ctx) {
+        throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
+    }
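+
+    // two-pass load: with no_alloc=true the gguf_init_from_file() call above
+    // only fills `meta` with tensor shapes and metadata; the actual weight
+    // data is streamed from `fname` in the "load tensors" block further down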
+
+    if (verbosity >= 1) {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+        const int n_kv = gguf_get_n_kv(ctx);
+        const int ftype = get_u32(ctx, KEY_FTYPE);
+        const std::string ftype_str = get_ftype(ftype);
+        const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
+        const std::string description = gguf_get_val_str(ctx, idx_desc);
+        const int idx_name = gguf_find_key(ctx, KEY_NAME);
+        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
+            const std::string name = gguf_get_val_str(ctx, idx_name);
+            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
+        }
+        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
+        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_TEE("\n");
+    }
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    // kv
+    const int n_kv = gguf_get_n_kv(ctx);
+    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+        __func__, n_kv, n_tensors, fname);
+    {
+        std::map<enum ggml_type, uint32_t> n_type;
+
+        for (int i = 0; i < n_tensors; i++) {
+            enum ggml_type type = gguf_get_tensor_type(ctx, i);
+
+            n_type[type]++;
+        }
+
+        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        for (int i = 0; i < n_kv; i++) {
+            const char * name           = gguf_get_key(ctx, i);
+            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(ctx, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
+
+            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+        }
+
+        // print type counts
+        for (auto & kv : n_type) {
+            if (kv.second == 0) {
+                continue;
+            }
+
+            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+        }
+    }
+
+    // data
+    size_t model_size = 0;
+    {
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name = gguf_get_tensor_name(ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+            enum ggml_type type = gguf_get_tensor_type(ctx, i);
+            struct ggml_tensor * cur = ggml_get_tensor(meta, name);
+            size_t tensor_size = ggml_nbytes(cur);
+            model_size += tensor_size;
+            if (verbosity >= 3) {
+                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
+            }
+        }
+    }
+
+    clip_ctx * new_clip = new clip_ctx;
+
+    // update projector type
+    {
+        int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
+        if (idx != -1) {
+            const std::string proj_type = gguf_get_val_str(ctx, idx);
+            new_clip->proj_type = clip_projector_type_from_string(proj_type);
+        } else {
+            new_clip->proj_type = PROJECTOR_TYPE_MLP;
+        }
+
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+            if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+                new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+            }
+        }
+    }
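+
+    // backend selection: prefer CUDA or Metal when compiled in; otherwise the
+    // CPU backend below is used and tensor data can be read in place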
+
+#ifdef GGML_USE_CUDA
+    new_clip->backend = ggml_backend_cuda_init(0);
+    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+    new_clip->backend = ggml_backend_metal_init();
+    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+
+    if (!new_clip->backend) {
+        new_clip->backend = ggml_backend_cpu_init();
+        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
+    }
+
+    // model size and capabilities
+    {
+        int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC);
+        new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx);
+
+        idx = get_key_idx(ctx, KEY_HAS_VIS_ENC);
+        new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx);
+
+        idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ);
+        if (idx != -1) {
+            new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
+        }
+
+        GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+        GGML_ASSERT(new_clip->has_vision_encoder);
+        GGML_ASSERT(!new_clip->has_text_encoder);
+
+        idx = get_key_idx(ctx, KEY_USE_GELU);
+        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
+
+        if (verbosity >= 1) {
+            LOG_TEE("%s: text_encoder:    %d\n", __func__, new_clip->has_text_encoder);
+            LOG_TEE("%s: vision_encoder:  %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
+            LOG_TEE("%s: model size:      %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_TEE("%s: metadata size:   %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+        }
+    }
+
+    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+
+    // load tensors
+    {
+        std::vector<uint8_t> read_buf;
+        struct ggml_init_params params = {
+            /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc =*/ true,
+        };
+
+        new_clip->ctx_data = ggml_init(params);
+        if (!new_clip->ctx_data) {
+            LOG_TEE("%s: ggml_init() failed\n", __func__);
+            clip_free(new_clip);
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        auto fin = std::ifstream(fname, std::ios::binary);
+        if (!fin) {
+            LOG_TEE("cannot open model file for loading tensors\n");
+            clip_free(new_clip);
+            gguf_free(ctx);
+            return nullptr;
+        }
+
+        // add tensors to context
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name = gguf_get_tensor_name(ctx, i);
+            struct ggml_tensor * t = ggml_get_tensor(meta, name);
+            struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
+            ggml_set_name(cur, name);
+        }
+
+        // alloc memory and offload data
+        new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend);
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name = gguf_get_tensor_name(ctx, i);
+            struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
+            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
+            fin.seekg(offset, std::ios::beg);
+            if (!fin) {
+                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
+                clip_free(new_clip);
+                gguf_free(ctx);
+                return nullptr;
+            }
+            int num_bytes = ggml_nbytes(cur);
+            if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+                // for the CPU and Metal backend, we can read directly into the tensor
+                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
+            } else {
+                // read into a temporary buffer first, then copy to device memory
+                read_buf.resize(num_bytes);
+                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + fin.close(); + } + + // vision model + if (new_clip->has_vision_encoder) { + // load vision model + auto & vision_model = new_clip->vision_model; + auto & hparams = vision_model.hparams; + hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision")); + hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision")); + hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision")); + hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision")); + hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE); + hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); + hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); + hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + + try { + int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); + int n = gguf_get_arr_n(ctx, idx); + const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); + for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) { + hparams.image_grid_pinpoints[i] = pinpoints[i]; + } + if (n < 32) + hparams.image_grid_pinpoints[n] = 0; + } catch (std::runtime_error & e) { + hparams.image_grid_pinpoints[0]=0; + } + + try { + int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); + strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); + } catch (std::runtime_error & e) { + strcpy(hparams.mm_patch_merge_type, "flat"); + } + + try { + hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 + } catch(const std::exception& e) { + hparams.image_crop_resolution = hparams.image_size; + } + + int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); + int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + + const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); + const float * std_data = (const float *)gguf_get_arr_data(ctx, idx_std); + + for (int i = 0; i < 3; ++i) { + new_clip->image_mean[i] = mean_data[i]; + new_clip->image_std[i] = std_data[i]; + } + + if (verbosity >= 2) { + LOG_TEE("\n%s: vision model hparams\n", __func__); + LOG_TEE("image_size %d\n", hparams.image_size); + LOG_TEE("patch_size %d\n", hparams.patch_size); + LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); + LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); + LOG_TEE("v_n_head %d\n", hparams.n_head); + LOG_TEE("v_n_layer %d\n", hparams.n_layer); + LOG_TEE("v_eps %f\n", hparams.eps); + LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_TEE("v_image_grid_pinpoints: "); + for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { + LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); + } + LOG_TEE("\n"); + LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + + } + + try { + vision_model.patch_embeddings_w = get_tensor(new_clip->ctx_data, format(TN_PATCH_EMBD, "weight")); + vision_model.patch_embeddings_b = get_tensor(new_clip->ctx_data, format(TN_PATCH_EMBD, "bias")); + // vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); + // vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); + // vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + vision_model.post_ln_w = 
get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); + vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); + } catch(const std::exception& e) { + LOG_TEE("%s: failed to load vision model tensors\n", __func__); + } + + // LLaVA projection + if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { + vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); + vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); + try { + // Yi-type llava + vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); + vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); + } catch (std::runtime_error & e) { } + try { + // missing in Yi-type llava + vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); + } catch (std::runtime_error & e) { } + try { + // Yi-type llava + vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); + vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); + } catch (std::runtime_error & e) { } + try { + // Yi-type llava + vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); + vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); + } catch (std::runtime_error & e) { } + try { + vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); + // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); + } catch (std::runtime_error & e) { } + } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projection + vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); + vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); + vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); + vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); + vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + 
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } + else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2) + { + // MobilVLM_V2 projection + vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight")); + vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias")); + vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight")); + vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias")); + vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight")); + vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias")); + } + else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) { + vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K); + vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY); + vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ); + vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ); + vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight")); + vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight")); + vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight")); + vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias")); + vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias")); + vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias")); + vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight")); + vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias")); + vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight")); + vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias")); + 
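+ // Aside (editor's note, not part of this patch): assuming TN_MINICPMV_ATTN is a printf-style
+ // pattern along the lines of "resampler.attn.%s.%s" (the actual string is defined earlier in
+ // this file), format(TN_MINICPMV_ATTN, "q", "weight") resolves to the gguf tensor name
+ // "resampler.attn.q.weight", and likewise for the k/v/out weights and biases below.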
vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight")); + vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias")); + vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); + vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); + } + else { + std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); + } + + vision_model.layers.resize(hparams.n_layer); + + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = vision_model.layers[il]; + layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight")); + layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight")); + layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight")); + layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight")); + layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight")); + layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight")); + layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight")); + layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight")); + layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias")); + layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias")); + layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias")); + layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias")); + layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias")); + layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias")); + layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias")); + layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias")); + } + } + + ggml_free(meta); + + new_clip->ctx_gguf = ctx; + + // measure mem requirement and allocate + { + new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); + new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); + clip_image_f32_batch batch; + batch.size = 1; + ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); + ggml_gallocr_reserve(new_clip->compute_alloc, gf); + size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); + LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + } + + return new_clip; +} + +struct clip_image_u8 * clip_image_u8_init() { + return new clip_image_u8(); +} + +struct clip_image_f32 * clip_image_f32_init() { + return new clip_image_f32(); +} + +void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } +void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; + } +} +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; + } +} + +static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { + img->nx 
= nx; + img->ny = ny; + img->buf.resize(3 * nx * ny); + memcpy(img->buf.data(), data, img->buf.size()); +} + +bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { + int nx, ny, nc; + auto * data = stbi_load(fname, &nx, &ny, &nc, 3); + if (!data) { + LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); + return false; + } + build_clip_img_from_data(data, nx, ny, img); + stbi_image_free(data); + return true; +} + +bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { + int nx, ny, nc; + auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); + if (!data) { + LOG_TEE("%s: failed to decode image bytes\n", __func__); + return false; + } + build_clip_img_from_data(data, nx, ny, img); + stbi_image_free(data); + return true; +} + +// Linear interpolation between two points +inline float lerp(float s, float e, float t) { + return s + (e - s) * t; +} +// Bilinear resize function +static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast<float>(src.nx - 1) / target_width; + float y_ratio = static_cast<float>(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast<int>(px); + int y_floor = static_cast<int>(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = lerp( + static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = lerp( + static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp)); + } + } + } +} + +// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not +static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { + dst->nx = src->nx; + dst->ny = src->ny; + dst->buf.resize(src->buf.size()); + + for (size_t i = 0; i < src->buf.size(); ++i) { + int c = i % 3; // rgb + dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c]; + } +} + +inline float clip(float x, float lower, float upper) { + return std::max(lower, std::min(x, upper)); +} + +static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = 
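+ // Aside (editor's sketch, not part of this patch): inverting normalize_image_u8_to_f32 is
+ // handy when dumping preprocessed tensors back to a viewable image:
+ //   u8 = clamp((f32 * std[c] + mean[c]) * 255.0f, 0.0f, 255.0f)
+ // i.e. undo the per-channel (p/255 - mean) / std mapping before casting back to uint8_t.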
ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; +} + +// llava-1.6 type of resize_and_pad (black) +static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) { + int target_width = target_resolution.first; + int target_height = target_resolution.second; + + float scale_w = static_cast<float>(target_width) / image.nx; + float scale_h = static_cast<float>(target_height) / image.ny; + + int new_width, new_height; + + if (scale_w < scale_h) { + new_width = target_width; + new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height); + } else { + new_height = target_height; + new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width); + } + + clip_image_u8 resized_image; + // bilinear_resize(image, resized_image, new_width, new_height); + bicubic_resize(image, resized_image, new_width, new_height); + + clip_image_u8 padded_image; + padded_image.nx = target_width; + padded_image.ny = target_height; + padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black + + // Calculate padding offsets + int pad_x = (target_width - new_width) / 2; + int pad_y = (target_height - new_height) / 2; + + // Copy the resized image into the center of the padded buffer + for (int y = 0; y < new_height; ++y) { + for (int x = 0; x < new_width; ++x) { + for (int c = 0; c < 3; ++c) { + padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; + } + } + } + image_output = std::move(padded_image); +} + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
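+ * Worked example (editor's illustration): for an 800x600 image and candidates
+ * [(672,672), (336,672)], the scales are min(672/800, 672/600) = 0.84 and
+ * min(336/800, 672/600) = 0.42; (672,672) wins with effective resolution
+ * min(672*504, 800*600) = 338688 versus 84672, so it is returned as the best fit.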
+ */ +static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + std::pair<int, int> best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits<int>::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height); + int downscaled_width = static_cast<int>(original_width * scale); + int downscaled_height = static_cast<int>(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} + +static std::vector<clip_image_u8 *> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { + std::vector<clip_image_u8 *> patches; + int width = image.nx; + int height = image.ny; + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + clip_image_u8 *patch = clip_image_u8_init(); + patch->nx = std::min(patch_size, width - j); + patch->ny = std::min(patch_size, height - i); + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = 0; y < patch->ny; ++y) { + for (int x = 0; x < patch->nx; ++x) { + for (int c = 0; c < 3; ++c) { + patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c]; + } + } + } + patches.push_back(patch); + } + } + return patches; +} + +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { + bool pad_to_square = true; + if (!ctx->has_vision_encoder) { + LOG_TEE("This gguf file seems to have no vision encoder\n"); + return false; + } + auto & params = ctx->vision_model.hparams; + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) { + pad_to_square = false; + } + // free the previous res_imgs if any set + if (res_imgs->size > 0) { + clip_image_f32_batch_free(res_imgs); + } + res_imgs->data = nullptr; + res_imgs->size = 0; + + // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + + clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily + if (pad_to_square && img->nx != img->ny) { + int longer_side = std::max(img->nx, img->ny); + temp->nx = longer_side; + temp->ny = longer_side; + 
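+ // Aside (editor's note, not part of this patch): downstream, with image_size = 336, a
+ // 672x1008 best resolution is cut by divide_to_patches_u8() into six 336x336 patches
+ // (3 rows x 2 columns), and the 336x336 bicubic downscale of the whole image is
+ // prepended, so the encoder sees seven sub-images for that input.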
temp->buf.resize(3 * longer_side * longer_side); + const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) + + // fill with background color + for (size_t i = 0; i < temp->buf.size(); i++) { + temp->buf[i] = bc[i % 3]; + } + + // copy from the input image + for (int y = 0; y < img->ny; y++) { + for (int x = 0; x < img->nx; x++) { + const int i = 3 * (y * img->nx + x); + const int j = 3 * (y * temp->nx + x); + temp->buf[j] = img->buf[i]; + temp->buf[j+1] = img->buf[i+1]; + temp->buf[j+2] = img->buf[i+2]; + } + } + } else { + if (params.image_grid_pinpoints[0] != 0) { + // "spatial_unpad" with "anyres" processing for llava-1.6 + std::vector<std::pair<int, int>> possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); + } + std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); + // clip_image_save_to_bmp(*img, "input.bmp"); + resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 + // clip_image_save_to_bmp(*temp, "resized.bmp"); + // visually verify normalized image: + // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + // { + // clip_image_u8 * temp2 = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*res, *temp2); + // clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp"); + // clip_image_u8_free(temp2); + // } + + std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) + + clip_image_u8 *image_original_resize = clip_image_u8_init(); + // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square + bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square + patches.insert(patches.begin(), image_original_resize); + // clip_image_f32_batch_init(patches.size()); + res_imgs->size = patches.size(); + res_imgs->data = new clip_image_f32[res_imgs->size]; + int num=0; + for (auto& patch : patches) { + normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std); + num++; + } + + for (size_t i = 0; i < patches.size(); i++) { + // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + clip_image_u8_free(patches[i]); + } + + clip_image_u8_free(temp); + + return true; + } else { + temp->nx = img->nx; + temp->ny = img->ny; + temp->buf.resize(img->buf.size()); + memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); + } + } + + const int nx = temp->nx; + const int ny = temp->ny; + // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp"); + + const int nx2 = ctx->vision_model.hparams.image_size; + const int ny2 = ctx->vision_model.hparams.image_size; + clip_image_f32 * res = clip_image_f32_init(); + res->nx = nx2; + res->ny = ny2; + res->buf.resize(3 * nx2 * ny2); + + const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size; + + const int nx3 = int(nx / scale + 0.5f); + const int ny3 = int(ny / scale + 0.5f); + + const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; + const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; + + for (int y = 0; y < ny3; y++) { + for (int x = 0; x < nx3; x++) { + for (int c = 0; c < 3; c++) { + 
// linear interpolation + const float sx = (x + 0.5f) * scale - 0.5f; + const float sy = (y + 0.5f) * scale - 0.5f; + + const int x0 = std::max(0, (int)std::floor(sx)); + const int y0 = std::max(0, (int)std::floor(sy)); + + const int x1 = std::min(x0 + 1, nx - 1); + const int y1 = std::min(y0 + 1, ny - 1); + + const float dx = sx - x0; + const float dy = sy - y0; + + const int j00 = 3 * (y0 * nx + x0) + c; + const int j01 = 3 * (y0 * nx + x1) + c; + const int j10 = 3 * (y1 * nx + x0) + c; + const int j11 = 3 * (y1 * nx + x1) + c; + + const float v00 = temp->buf[j00]; + const float v01 = temp->buf[j01]; + const float v10 = temp->buf[j10]; + const float v11 = temp->buf[j11]; + + const float v0 = v00 * (1.0f - dx) + v01 * dx; + const float v1 = v10 * (1.0f - dx) + v11 * dx; + + const float v = v0 * (1.0f - dy) + v1 * dy; + + const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); + + const int i = 3 * (y * nx3 + x) + c; + + res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c]; + } + } + } + clip_image_u8_free(temp); + + // { + // clip_image_u8 * temp2 = clip_image_u8_init(); + // clip_image_convert_f32_to_u8(*res, *temp2); + // clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp"); + // clip_image_u8_free(temp2); + // } + // res_imgs.push_back(res); + + res_imgs->size = 1; + res_imgs->data = new clip_image_f32[res_imgs->size]; + res_imgs->data[0] = *res; + clip_image_f32_free(res); + + return true; +} + +ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { + return ctx->vision_model.image_newline; +} + +void clip_free(clip_ctx * ctx) { + ggml_free(ctx->ctx_data); + gguf_free(ctx->ctx_gguf); + + ggml_backend_buffer_free(ctx->params_buffer); + ggml_backend_free(ctx->backend); + ggml_gallocr_free(ctx->compute_alloc); + delete ctx; +} + +size_t clip_embd_nbytes(const struct clip_ctx * ctx) { + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + +int32_t clip_image_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_size; +} + +int32_t clip_patch_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.patch_size; +} + +int32_t clip_hidden_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.hidden_size; +} + +const char * clip_patch_merge_type(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.mm_patch_merge_type; +} + +const int32_t * clip_image_grid(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_grid_pinpoints; +} + +int clip_n_patches(const struct clip_ctx * ctx) { + const auto & params = ctx->vision_model.hparams; + + int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); + + if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { + n_patches /= 4; + } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + n_patches = 64; + } + + return n_patches; +} + +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { + if (!ctx->has_vision_encoder) { + LOG_TEE("This gguf file seems to have no vision encoder\n"); + return false; + } + + clip_image_f32_batch imgs{}; + imgs.size = 1; + imgs.data = img; + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); +} + +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { + if (!ctx->has_vision_encoder) { + LOG_TEE("This gguf file seems to have no vision encoder\n"); + return false; + } + + int batch_size = 
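+ // Aside (editor's note, not part of this patch): output sizing for the resampler
+ // projector follows from the helpers above: clip_n_patches() returns 64 and
+ // clip_n_mmproj_embd() returns 2304, so clip_embd_nbytes() = 64 * 2304 * sizeof(float)
+ // = 589824 bytes per image; callers allocate that much before clip_image_encode().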
imgs->size; + if (ctx->has_llava_projector) { + GGML_ASSERT(batch_size == 1); // TODO: support multiple images + } + + // build the inference graph + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); + + // set inputs + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size = hparams.image_size; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_positions = num_patches; + + { + struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); + float * data = (float *)malloc(ggml_nbytes(inp_raw)); + + for (size_t i = 0; i < imgs->size; i++) { + const int nx = imgs->data[i].nx; + const int ny = imgs->data[i].ny; + GGML_ASSERT(nx == image_size && ny == image_size); + + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + for (int k = 0; k < 3; k++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k]; + } + } + } + } + } + ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); + free(data); + } + + { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = i; + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + + // { + // struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + // int* patches_data = (int*)malloc(ggml_nbytes(patches)); + // for (int i = 0; i < num_patches; i++) { + // patches_data[i] = i + 1; + // } + // ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + // free(patches_data); + // } + + if (ggml_backend_is_cpu(ctx->backend)) { + ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); + } + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(ctx->backend)) { + ggml_backend_metal_set_n_cb(ctx->backend, n_threads); + } +#endif + ggml_backend_graph_compute(ctx->backend, gf); + + // the last node is the embedding tensor + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + + // copy the embeddings to the location passed by the user + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + + return true; +} + +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { + ggml_type type = GGML_TYPE_Q4_1; + + assert(itype < GGML_TYPE_COUNT); + type = static_cast<ggml_type>(itype); + + auto * ctx_clip = clip_model_load(fname_inp, 2); + + const auto & ctx_src = ctx_clip->ctx_gguf; + const auto & ctx_data = ctx_clip->ctx_data; + + auto * ctx_out = gguf_init_empty(); + gguf_set_kv(ctx_out, ctx_src); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", itype); + + auto fout = std::ofstream(fname_out, std::ios::binary); + + const int n_tensors = gguf_get_n_tensors(ctx_src); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + gguf_add_tensor(ctx_out, cur); + } + + const size_t meta_size = gguf_get_meta_size(ctx_out); + for (size_t i = 0; i < meta_size; ++i) { + fout.put(0); + } + + // regexes of tensor names to be quantized + const std::vector<std::string> k_names = { + ".*weight", + }; + + std::vector<uint8_t> work(512); + std::vector<float> conv_buf(512); + size_t total_size_org = 0; + size_t total_size_new = 0; + + for (int i = 0; i < n_tensors; ++i) { + const std::string name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + + bool quantize = false; + for (const auto & s : k_names) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // quantize only 2D tensors + quantize &= (ggml_n_dims(cur) == 2); + + if (quantize) { + new_type = type; + if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { + new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type + // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + } + const size_t n_elms = ggml_nelements(cur); + float * f32_data; + + switch (cur->type) { + case GGML_TYPE_F32: + f32_data = (float *)cur->data; + break; + case GGML_TYPE_F16: + if (conv_buf.size() < n_elms) { + conv_buf.resize(n_elms); + } + for (size_t j = 0; j < n_elms; ++j) { + conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]); + } + f32_data = (float *)conv_buf.data(); + break; + default: + LOG_TEE("Please use an input file in f32 or f16\n"); + gguf_free(ctx_out); + return false; + } + + if (work.size() < n_elms * 4) { + work.resize(n_elms * 4); + } + new_data = work.data(); + + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr); + } else { + new_type = cur->type; + new_data = cur->data; + new_size = ggml_nbytes(cur); + } + const size_t orig_size = ggml_nbytes(cur); + total_size_org += orig_size; + total_size_new += new_size; + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + fout.write((const char *)new_data, new_size); + size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; + for (size_t j = 0; j < pad; ++j) { + fout.put(0); + } + + LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + } + + // go back to beginning of file and write the updated metadata + fout.seekp(0, std::ios::beg); + std::vector<uint8_t> meta(meta_size); + gguf_get_meta_data(ctx_out, meta.data()); + fout.write((const char *)meta.data(), meta_size); + + fout.close(); + + clip_free(ctx_clip); + gguf_free(ctx_out); + + { + LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + } + + return true; +} + +int clip_n_mmproj_embd(const struct clip_ctx * ctx) { + if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + return ctx->vision_model.mm_2_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + return ctx->vision_model.mm_3_b->ne[0]; + } + if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + return 2304; + } + + std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; + throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); +} diff --git a/examples/minicpmv/clip.h b/examples/minicpmv/clip.h new file mode 100644 index 
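+// Editor's usage sketch (not part of this patch): driving the public API declared in the
+// clip.h header below; error handling trimmed, file paths are placeholders.
+#include "clip.h"
+#include <cstdlib>
+
+int encode_one_image(const char * mmproj_path, const char * image_path, int n_threads) {
+    struct clip_ctx * ctx = clip_model_load(mmproj_path, /*verbosity=*/1);
+    struct clip_image_u8 * img = clip_image_u8_init();
+    if (!ctx || !clip_image_load_from_file(image_path, img)) return 1;
+
+    struct clip_image_f32_batch batch = {nullptr, 0};       // filled by preprocessing
+    if (!clip_image_preprocess(ctx, img, &batch)) return 1;
+
+    float * embd = (float *)malloc(clip_embd_nbytes(ctx));  // 64 * 2304 floats for the resampler
+    clip_image_encode(ctx, n_threads, &batch.data[0], embd); // encode the first preprocessed slot
+
+    free(embd);
+    clip_image_f32_batch_free(&batch);
+    clip_image_u8_free(img);
+    clip_free(ctx);
+    return 0;
+}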
0000000000000..45bdad6897658 --- /dev/null +++ b/examples/minicpmv/clip.h @@ -0,0 +1,85 @@ +#ifndef CLIP_H +#define CLIP_H + +#include <stddef.h> +#include <stdint.h> + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define CLIP_API __declspec(dllexport) +# else +# define CLIP_API __declspec(dllimport) +# endif +# else +# define CLIP_API __attribute__ ((visibility ("default"))) +# endif +#else +# define CLIP_API +#endif + +struct clip_ctx; + +#ifdef __cplusplus +extern "C" { +#endif + +struct clip_ctx; + +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; +}; + +CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); +CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); + +CLIP_API void clip_free(struct clip_ctx * ctx); + +CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); + +CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); + +// TODO: should be enum, not string +CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); + +CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); + +CLIP_API int clip_n_patches (const struct clip_ctx * ctx); +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); + +CLIP_API struct clip_image_u8 * clip_image_u8_init (); +CLIP_API struct clip_image_f32 * clip_image_f32_init(); + +CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); +CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); + +CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); + +/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ +CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + +/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); + +CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); + +CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); +CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); + +CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); + +#ifdef __cplusplus +} +#endif + +#endif // CLIP_H diff --git a/examples/minicpmv/convert-image-encoder-to-gguf.py b/examples/minicpmv/convert-image-encoder-to-gguf.py new file mode 100644 index 0000000000000..25a255f54cad9 --- /dev/null +++ b/examples/minicpmv/convert-image-encoder-to-gguf.py @@ -0,0 +1,403 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * +import timm + +TEXT = "clip.text" +VISION = "clip.vision" + + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, 
has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_llava and name in ["visual_projection.weight"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +# if args.clip_model_is_vision or args.clip_model_is_openclip: +# model = CLIPVisionModel.from_pretrained(dir_model) +# processor = None +# else: +# model = CLIPModel.from_pretrained(dir_model) +# processor = CLIPProcessor.from_pretrained(dir_model) +model = timm.create_model( + "vit_so400m_patch14_siglip_384.webli", + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True, +) +processor = None +if model.attn_pool is not None: + model.attn_pool = torch.nn.Identity() + +model.blocks = model.blocks[:-1] +model.load_state_dict(torch.load(os.path.join(dir_model, "llava.clip"))) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector) +fout.add_file_type(ftype) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP model") +elif has_llava_projector: + fout.add_description("image encoder for LLaVA") + # add projector type + fout.add_string("clip.projector_type", "resampler") +else: + fout.add_description("two-tower CLIP model") + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", 448) + 
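+# Editor's note (not part of this patch): these hard-coded hparams describe the SigLIP
+# ViT-SO400M/14 trunk at 448x448: (448/14)^2 = 1024 patch tokens per image, hidden size
+# 1152, FFN width 4304, 16 heads, and 26 blocks after model.blocks[:-1] dropped the last one.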
fout.add_uint32("clip.vision.patch_size", 14) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), 1152) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 4304) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), 16) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + block_count = 26 + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = True +fout.add_bool("clip.use_gelu", use_gelu) + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. / 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + +def _replace_name_resampler(s, v): + if re.match("resampler.pos_embed", s): + return { + s: v, + re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(2304, (448//14, 448//14))), + } + if re.match("resampler.proj", s): + return { + re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), + } + if re.match("resampler.attn.in_proj_.*", s): + return { + re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0], + re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1], + re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2], + } + return {s: v} + +if 
has_llava_projector: + projector = torch.load(args.llava_projector) + new_state_dict = {} + for k, v in projector.items(): + kvs = _replace_name_resampler(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv + projector = new_state_dict + for name, data in projector.items(): + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + if ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + fout.add_tensor(name, data) + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + + print("Projector tensors added\n") + +def _replace_name(s, v): + if re.match("blocks.([0-9]+).attn.qkv.weight", s): + return { + re.sub("blocks.([0-9]+).attn.qkv.weight", "vision_model.encoder.layers.\\1.self_attn.q_proj.weight", s): v.chunk(3, dim=0)[0], + re.sub("blocks.([0-9]+).attn.qkv.weight", "vision_model.encoder.layers.\\1.self_attn.k_proj.weight", s): v.chunk(3, dim=0)[1], + re.sub("blocks.([0-9]+).attn.qkv.weight", "vision_model.encoder.layers.\\1.self_attn.v_proj.weight", s): v.chunk(3, dim=0)[2], + } + if re.match("blocks.([0-9]+).attn.qkv.bias", s): + return { + re.sub("blocks.([0-9]+).attn.qkv.bias", "vision_model.encoder.layers.\\1.self_attn.q_proj.bias", s): v.chunk(3, dim=0)[0], + re.sub("blocks.([0-9]+).attn.qkv.bias", "vision_model.encoder.layers.\\1.self_attn.k_proj.bias", s): v.chunk(3, dim=0)[1], + re.sub("blocks.([0-9]+).attn.qkv.bias", "vision_model.encoder.layers.\\1.self_attn.v_proj.bias", s): v.chunk(3, dim=0)[2], + } + if re.match("pos_embed", s): + from timm.layers import resample_abs_pos_embed + s = re.sub("pos_embed", "vision_model.embeddings.position_embedding", s) + v = resample_abs_pos_embed(v, (448//14, 448//14), num_prefix_tokens=0) + return {s: v} + + s = re.sub("patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.proj.weight", s) + s = re.sub("patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.proj.bias", s) + + # norm + s = re.sub("blocks.([0-9]+).norm([0-9]+).weight", "vision_model.encoder.layers.\\1.layer_norm\\2.weight", s) + s = re.sub("blocks.([0-9]+).norm([0-9]+).bias", "vision_model.encoder.layers.\\1.layer_norm\\2.bias", s) + + s = re.sub("blocks.([0-9]+).attn.proj.weight", "vision_model.encoder.layers.\\1.self_attn.out_proj.weight", s) + s = re.sub("blocks.([0-9]+).attn.proj.bias", "vision_model.encoder.layers.\\1.self_attn.out_proj.bias", s) + + s = re.sub("blocks.([0-9]+).mlp.fc([0-9]+).weight", "vision_model.encoder.layers.\\1.mlp.fc\\2.weight", s) + s = re.sub("blocks.([0-9]+).mlp.fc([0-9]+).bias", "vision_model.encoder.layers.\\1.mlp.fc\\2.bias", s) + + s = re.sub("norm.weight", "vision_model.post_layernorm.weight", s) + s = re.sub("norm.bias", "vision_model.post_layernorm.bias", s) + + return {s: v} + +state_dict = model.state_dict() +new_state_dict = {} +for k, v in state_dict.items(): + kvs = _replace_name(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv +state_dict = new_state_dict +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) 
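+    # Editor's note (not part of this patch): _replace_name above splits timm's fused
+    # blocks.N.attn.qkv.weight of shape (3456, 1152) with v.chunk(3, dim=0): rows
+    # [0:1152) become q_proj, [1152:2304) k_proj, [2304:3456) v_proj, which is the
+    # layout clip.cpp loads back as v.blk.N.attn_{q,k,v}.weight.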
+ + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/examples/minicpmv/minicpm-surgery.py b/examples/minicpmv/minicpm-surgery.py new file mode 100644 index 0000000000000..97a02c6ca9b50 --- /dev/null +++ b/examples/minicpmv/minicpm-surgery.py @@ -0,0 +1,48 @@ +import argparse +import glob +import os +import torch +from transformers import AutoModel, AutoTokenizer + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/llava.projector") + +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] +if len(clip_tensors) > 0: + clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/llava.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +config = model.llm.config +config._name_or_path = "openbmb/CPM-2B" +config.auto_map = { + "AutoConfig": "configuration_minicpm.MiniCPMConfig", + "AutoModel": "modeling_minicpm.MiniCPMModel", + "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification" +} +model.llm.save_pretrained(f"{args.model}/MiniCPM") +tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) +tok.save_pretrained(f"{args.model}/MiniCPM") +os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM/modeling_minicpm.py") + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/examples/minicpmv/minicpmv-cli.cpp b/examples/minicpmv/minicpmv-cli.cpp new file mode 100644 index 0000000000000..f8d8e61caad34 --- /dev/null +++ b/examples/minicpmv/minicpmv-cli.cpp @@ -0,0 +1,331 @@ +#include "ggml.h" +#include "log.h" +#include "common.h" +#include "clip.h" +#include "minicpmv.h" +#include "llama.h" + +#include "base64.hpp" + +#include +#include +#include + +static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { + int N = (int) 
tokens.size(); + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + } + return true; +} + +static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { + std::vector<llama_token> tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past); +} + +static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ + std::string str2 = str; + std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + eval_tokens(ctx_llama, embd_inp, n_batch, n_past); + return true; +} + +static const char * sample(struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_llama, + int * n_past) { + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = "</s>"; + } else { + ret = llama_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past); + return ret.c_str(); +} + +static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,"; +static const char* IMG_BASE64_TAG_END = "\">"; + +static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) { + begin_out = prompt.find(IMG_BASE64_TAG_BEGIN); + end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out); +} + +static bool prompt_contains_image(const std::string& prompt) { + size_t begin, end; + find_image_tag_in_prompt(prompt, begin, end); + return (begin != std::string::npos); +} + +// replaces the base64 image tag in the prompt with `replacement` +static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) { + size_t img_base64_str_start, img_base64_str_end; + find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); + if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { + LOG_TEE("%s: invalid base64 image tag. 
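+ // Editor's note (not part of this patch): with the tag pair above, an image can be passed
+ // inline instead of via --image, e.g. a prompt of the form
+ //   <img src="data:image/jpeg;base64,...base64 bytes...">describe the image in detail.
+ // prompt_contains_image() then returns true and the embed is decoded out of the tag.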
must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + return NULL; + } + + auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN); + auto base64_bytes_count = img_base64_str_end - base64_bytes_start; + auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count ); + + auto required_bytes = base64::required_encode_size(base64_str.size()); + auto img_bytes = std::vector<unsigned char>(required_bytes); + base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); + + auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); + if (!embed) { + LOG_TEE("%s: could not load image from base64 string.\n", __func__); + return NULL; + } + + return embed; +} + +static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") { + size_t begin, end; + find_image_tag_in_prompt(prompt, begin, end); + if (begin == std::string::npos || end == std::string::npos) { + return prompt; + } + auto pre = prompt.substr(0, begin); + auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END)); + return pre + replacement + post; +} + +struct llava_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG_TEE("\n example usage: %s -m <model.gguf> --mmproj <mmproj.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) { + + // load and preprocess the image + llava_image_embed * embed = NULL; + auto prompt = params->prompt; + if (prompt_contains_image(prompt)) { + if (!params->image.empty()) { + LOG_TEE("using base64 encoded image instead of command line image path\n"); + } + embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); + if (!embed) { + LOG_TEE("%s: can't load image from prompt\n", __func__); + return NULL; + } + params->prompt = remove_image_from_prompt(prompt); + } else { + embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str()); + if (!embed) { + LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str()); + return NULL; + } + } + + return embed; +} + +static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, std::vector<std::vector<struct llava_image_embed *>> image_embed_slices, gpt_params * params, int &n_past) { + std::string system_prompt; + + system_prompt = "<用户>"; + + eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, true); + llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past); + if (!image_embed_slices.empty()) { + eval_string(ctx_llava->ctx_llama, std::string("</image><slice>").c_str(), params->n_batch, &n_past, false); + for (int i = 0; i < image_embed_slices.size(); ++i) { + eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false); + for (int j = 0; j < image_embed_slices[i].size(); ++j) { + llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices[i][j], params->n_batch, &n_past); + if (j != image_embed_slices[i].size() - 1) { + eval_string(ctx_llava->ctx_llama, std::string("</image><image>").c_str(), params->n_batch, &n_past, false); + } else { + if (i != image_embed_slices.size() - 1) { + eval_string(ctx_llava->ctx_llama, 
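+ // Editor's note (not part of this patch): each llava_eval_image_embed() call above
+ // advances n_past by clip_n_patches() positions (64 for the resampler), and each
+ // eval_string() by its token count, so a 2x3 slice grid costs 64 * (1 + 6) image
+ // positions plus the image/slice separator tokens before generation starts.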
+static void process_user_input(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false) {
+    std::string user_prompt = prompt + "<AI>";
+    if (!is_first) user_prompt = "<用户>" + prompt;
+    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+
+    LOG_TEE("\n");
+
+    // generate the response
+
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    std::string response = "";
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+        response += tmp;
+        if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior
+        printf("%s", tmp);
+        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - these decode as plain text here even though the tokenizer itself works
+        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
+        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
+        if (strstr(response.c_str(), "<用户>")) break; // minicpm-v
+
+        fflush(stdout);
+    }
+
+    llama_sampling_free(ctx_sampling);
+    printf("\n");
+}
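Taken together, process_image and process_user_input serialize one multimodal turn as plain chat text with embedding positions spliced in. The sketch below is illustrative only; it mirrors the eval_string sequence above, with "[EMB]" standing for positions written by llava_eval_image_embed rather than text tokens:

// sketch: the text skeleton emitted around the embeddings for an r x c slice grid
static std::string debug_turn_skeleton(int rows, int cols, const std::string & question) {
    std::string s = "<用户><image>[EMB]";
    if (rows > 0 && cols > 0) {
        s += "</image><slice>";
        for (int i = 0; i < rows; ++i) {
            s += "<image>";
            for (int j = 0; j < cols; ++j) {
                s += "[EMB]";
                if (j != cols - 1)      s += "</image><image>"; // between columns
                else if (i != rows - 1) s += "</image>\n";      // end of a row
                else                    s += "</image>";        // end of last row
            }
        }
        s += "</slice>\n";
    } else {
        s += "</image>\n"; // no slices: just the main image
    }
    return s + question + "<AI>";
}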
+static struct llava_context * llava_init(gpt_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
+        return NULL;
+    }
+
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->ctx_clip = ctx_clip;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+}
+
+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS
+
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    auto ctx_llava = llava_init(&params);
+    if (ctx_llava == NULL) {
+        LOG_TEE("%s: error: failed to init llava\n", __func__);
+        return 1;
+    }
+
+    // auto image_embed = load_image(ctx_llava, &params);
+    auto image_embed_and_slices = llava_image_embed_make_with_filename_slice(ctx_llava->ctx_clip, params.n_threads, params.image.c_str());
+    auto image_embed = image_embed_and_slices.first;
+    auto image_embed_slices = image_embed_and_slices.second;
+
+    if (!image_embed) {
+        LOG_TEE("%s: is %s really an image file?\n", __func__, params.image.c_str());
+        return 1;
+    }
+
+    // process the prompt
+    if (params.prompt.empty() && params.interactive == false) {
+        LOG_TEE("prompt should be given or interactive mode should be on");
+        return 1;
+    }
+
+    int n_past = 0;
+    process_image(ctx_llava, image_embed, image_embed_slices, &params, n_past);
+
+    if (!params.prompt.empty()) {
+        LOG_TEE("<用户>%s\n", params.prompt.c_str());
+        LOG_TEE("<AI>");
+        process_user_input(ctx_llava, &params, params.prompt, n_past, true);
+    } else {
+        while (true) {
+            LOG_TEE("<用户>");
+            std::string prompt;
+            std::getline(std::cin, prompt);
+            LOG_TEE("<AI>");
+            process_user_input(ctx_llava, &params, prompt, n_past, true);
+        }
+    }
+
+    llama_print_timings(ctx_llava->ctx_llama);
+
+    llava_image_embed_free(image_embed);
+    llava_free(ctx_llava);
+    return 0;
+}
diff --git a/examples/minicpmv/minicpmv.cpp b/examples/minicpmv/minicpmv.cpp
new file mode 100644
index 0000000000000..94fe662ba4ff1
--- /dev/null
+++ b/examples/minicpmv/minicpmv.cpp
@@ -0,0 +1,556 @@
+#include "clip.h"
+#include "common.h"
+#include "llama.h"
+#include "minicpmv.h"
+#include "base64.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <cmath>
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
+struct clip_image_grid_shape {
+    int first;
+    int second;
+};
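The two structs above store pixels interleaved (NHWC). A tiny sketch, purely illustrative and not part of the patch, of what that layout means in practice:

// sketch: a 2x1 RGB image holds 6 bytes, three consecutive channels per pixel
static void fill_example_image(clip_image_u8 & img) {
    img.nx = 2;
    img.ny = 1;
    img.buf.resize(3 * img.nx * img.ny);
    img.buf[0] = 255; img.buf[1] = 0;   img.buf[2] = 0; // pixel (0,0): red
    img.buf[3] = 0;   img.buf[4] = 255; img.buf[5] = 0; // pixel (1,0): green
}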
+/**
+ * Selects the best resolution from a list of possible resolutions based on the original size.
+ *
+ * @param original_size The original size of the image in the format (width, height).
+ * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+ * @return The best fit resolution in the format (width, height).
+ */
+static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) {
+    int original_width = original_size.first;
+    int original_height = original_size.second;
+
+    std::pair<int, int> best_fit;
+    int max_effective_resolution = 0;
+    int min_wasted_resolution = std::numeric_limits<int>::max();
+
+    for (const auto& resolution : possible_resolutions) {
+        int width = resolution.first;
+        int height = resolution.second;
+        float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
+        int downscaled_width  = static_cast<int>(original_width * scale);
+        int downscaled_height = static_cast<int>(original_height * scale);
+        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
+        int wasted_resolution = (width * height) - effective_resolution;
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+            max_effective_resolution = effective_resolution;
+            min_wasted_resolution = wasted_resolution;
+            best_fit = resolution;
+        }
+    }
+
+    return best_fit;
+}
+
+/**
+ * @brief Get the anyres image grid shape object
+ *
+ * @param image_size
+ * @param grid_pinpoints
+ * @param image_patch_size
+ * @return <int, int>
+ */
+static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
+    /**
+        Conversion from gguf flat array to vector:
+        std::vector<std::pair<int, int>> possible_resolutions;
+        for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
+        }
+     */
+    auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
+    return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
+}
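A worked example of the selection above, under an assumed pinpoint list: for an 800x600 input and candidates {336x672, 672x336, 672x672}, the downscaled sizes are 336x252, 448x336 and 672x504, so the effective resolutions are 84672, 150528 and 338688; 672x672 wins, and with a 336 patch size the grid comes out 2x2:

// sketch only; the pinpoint list is hypothetical
static clip_image_grid_shape example_grid_shape() {
    std::vector<std::pair<int, int>> pinpoints = {{336, 672}, {672, 336}, {672, 672}};
    return get_anyres_image_grid_shape({800, 600}, pinpoints, 336); // expected: {2, 2}
}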
+// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+    struct {
+        struct ggml_tensor * newline;
+        struct ggml_context * ctx;
+    } model;
+
+    const int32_t image_size = clip_image_size(ctx_clip);
+    const int32_t patch_size = clip_patch_size(ctx_clip);
+
+    int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
+
+    int num_patches_width  = grid_shape.first;  // grid 1-4
+    int num_patches_height = grid_shape.second; // grid 1-4
+
+    const size_t num_images = num_patches_width * num_patches_height + 1;
+
+    // TODO: the context size is only a rough estimate, not computed exactly - it is only tens of MB
+    size_t ctx_size = 0;
+
+    {
+        ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
+        ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
+    }
+
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
+    };
+
+    // Python reference code for full unpad:
+    /*
+        base_image_feature = image_feature[0]
+        image_feature = image_feature[1:]
+        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+        image_feature = unpad_image(image_feature, image_sizes[image_idx])
+        image_feature = torch.cat((
+            image_feature,
+            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
+        ), dim=-1)
+        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+    */
+    // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
+    // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
+    // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
+    // Once all images are processed, the base_image_feature is prepended without any changes.
+
+    // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (672x672 scaling))
+    /*
+        image_feature = image_feature.view(2, 2, 24, 24, 4096)
+        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+        image_feature = image_feature.view(2, 24, 2, 24, 4096)
+        image_feature = image_feature.flatten(0, 3)
+
+        // Reshape to 4D tensor by merging the last two dimensions
+        image_feature = image_feature.view(2, 2, 24, 24*4096)
+        image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
+        image_feature = image_feature.view(-1, 4096)
+    */
+
+    model.ctx = ggml_init(params);
+
+    ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
+    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
+    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
+        if (newline_tmp->buffer == NULL) {
+            LOG_TEE("newline_tmp tensor buffer is NULL\n");
+        }
+        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
+    } else {
+        model.newline->data = newline_tmp->data;
+        if (model.newline->data == NULL) {
+            LOG_TEE("newline_tmp tensor data is NULL\n");
+        }
+    }
+
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
+    // fill it with the image embeddings, ignoring the base
+    for (size_t i = 1; i < num_images; i++) {
+        size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
+        memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
+    }
+
+    struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
+    size_t size_ele = ggml_type_size(GGML_TYPE_F32);
+
+    struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
+                                                                num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+                                                                num_patches_per_side,
+                                                                num_patches_width,
+                                                                num_patches_height,
+                                                                size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+                                                                size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
+                                                                size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
+    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
+    struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
+    /**
+     At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
+         image_feature = torch.cat((
+             image_feature,
+             self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
+         ), dim=-1)
+     */
+
+    // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
+    struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
+    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
+    ggml_build_forward_expand(gf, flatten);
+    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+
+    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
+    // append without newline tokens (default behavior in llava_arch when not using unpad):
+    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+
+    // Debug: Test single segments
+    // Current findings: sending base image, sending a segment embedding all works similar to python
+    // However, permuted embeddings do not work yet (stride issue?)
+    // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
+    // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
+    // *n_img_pos_out=576;
+
+    ggml_free(model.ctx);
+    return true;
+}
+
+
+static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
+    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
+    clip_image_f32_batch img_res_v;
+    img_res_v.size = 0;
+    img_res_v.data = nullptr;
+    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
+        LOG_TEE("%s: unable to preprocess image\n", __func__);
+        delete[] img_res_v.data;
+        return false;
+    }
+
+    const int64_t t_img_enc_start_us = ggml_time_us();
+
+    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
+
+    if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
+        // flat / default llava-1.5 type embedding
+        *n_img_pos = clip_n_patches(ctx_clip);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
+        delete[] img_res_v.data;
+        if (!encoded) {
+            LOG_TEE("Unable to encode image\n");
+
+            return false;
+        }
+    } else {
+        // spatial_unpad llava-1.6 type embedding
+        // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
+        std::vector<float *> image_embd_v;
+        image_embd_v.resize(img_res_v.size);
+        for (size_t i = 0; i < img_res_v.size; i++) {
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
+            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
+            if (!encoded) {
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                return false;
+            }
+        }
+        const int64_t t_img_enc_batch_us = ggml_time_us();
+        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+        const int32_t * image_grid = clip_image_grid(ctx_clip);
+
+        std::vector<std::pair<int, int>> grid_pinpoints;
+        for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
+            grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
+        }
+
+        // free all img_res_v - not needed anymore
+        delete[] img_res_v.data;
+        img_res_v.size = 0;
+        img_res_v.data = nullptr;
+
+        const int32_t image_size = clip_image_size(ctx_clip);
+
+        struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
+
+        int n_img_pos_out;
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+        *n_img_pos = n_img_pos_out;
+
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            free(image_embd_v[i]);
+        }
+        image_embd_v.clear();
+
+        // debug image/segment/normalization content:
+        // clip_image_u8 * tmp = clip_image_u8_init();
+        // clip_image_convert_f32_to_u8(*image_feature, *tmp);
+        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
+    }
+
+    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+
+    const int64_t t_img_enc_end_us = ggml_time_us();
+    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
+
+    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+
+    return true;
+}
+
+bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
+    // make sure that the correct mmproj was used, i.e., compare apples to apples
+    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
+    if (n_image_embd != n_llama_embd) {
+        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        return false;
+    }
+    return true;
+}
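A hedged usage sketch of the check above (the two handles are whatever the caller loaded earlier; this wrapper is not part of the patch): callers typically validate once, right after loading both models, and bail out before any decoding.

static bool example_check_projector(llama_context * ctx_llama, clip_ctx * ctx_clip) {
    if (!llava_validate_embed_size(ctx_llama, ctx_clip)) {
        return false; // wrong mmproj for this LLM; abort before decoding anything
    }
    return true;
}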
+bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
+    if (!image_embd) {
+        LOG_TEE("Unable to allocate memory for image embeddings\n");
+        return false;
+    }
+
+    int n_img_pos;
+    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
+        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+        free(image_embd);
+        return false;
+    }
+    *image_embd_out = image_embd;
+    *n_img_pos_out = n_img_pos;
+
+    return true;
+}
+
+bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
+    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
+
+    for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
+        int n_eval = image_embed->n_image_pos - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        if (llama_decode(ctx_llama, batch)) {
+            LOG_TEE("%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
+    clip_image_u8 * img = clip_image_u8_init();
+    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
+        clip_image_u8_free(img);
+        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+        return NULL;
+    }
+
+    float* image_embed = NULL;
+    int n_image_pos = 0;
+    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
+    if (!image_embed_result) {
+        clip_image_u8_free(img);
+        LOG_TEE("%s: couldn't embed the image\n", __func__);
+        return NULL;
+    }
+
+    clip_image_u8_free(img);
+    auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    result->embed = image_embed;
+    result->n_image_pos = n_image_pos;
+    return result;
+}
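For orientation, an end-to-end sketch of this API for a single unsliced image; "image.jpg" is a placeholder path, and all of the functions used here are declared in minicpmv.h, so definition order does not matter:

static bool example_eval_one_image(clip_ctx * ctx_clip, llama_context * ctx_llama, int n_threads, int n_batch, int * n_past) {
    llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, n_threads, "image.jpg");
    if (!embed) {
        return false;
    }
    // writes n_image_pos embedding rows into the context, advancing *n_past
    bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, n_past);
    llava_image_embed_free(embed);
    return ok;
}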
+std::vector<std::vector<clip_image_u8 *>> slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14, const bool never_split=false) {
+
+    const int original_width  = img->nx;
+    const int original_height = img->ny;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+    std::vector<std::vector<clip_image_u8 *>> images;
+    LOG_TEE("%s: multiple %d\n", __func__, multiple);
+    if (multiple > 1) {
+
+        std::vector<int> candidate_split_grids_nums;
+        for (int i : {multiple - 1, multiple, multiple + 1}) {
+            if (i == 1 || i > max_slice_nums) {
+                continue;
+            }
+            candidate_split_grids_nums.push_back(i);
+        }
+
+        std::vector<std::pair<int, int>> candidate_grids;
+
+        for (int split_grids_nums : candidate_split_grids_nums) {
+            int m = 1;
+            while (m <= split_grids_nums) {
+                if (split_grids_nums % m == 0) {
+                    candidate_grids.emplace_back(m, split_grids_nums / m);
+                }
+                ++m;
+            }
+        }
+
+        std::pair<int, int> best_grid{1, 1};
+        float min_error = std::numeric_limits<float>::infinity();
+
+        for (const auto& grid : candidate_grids) {
+            float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
+            if (error < min_error) {
+                best_grid = grid;
+                min_error = error;
+            }
+        }
+        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+
+        // split_to_patches
+        int width  = img->nx;
+        int height = img->ny;
+        int grid_x = int(width  / best_grid.first);
+        int grid_y = int(height / best_grid.second);
+        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1) {
+            images.push_back(std::vector<clip_image_u8 *>());
+            for (int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1) {
+                clip_image_u8 * patch = clip_image_u8_init();
+                patch->nx = grid_x;
+                patch->ny = grid_y;
+                patch->buf.resize(3 * patch->nx * patch->ny);
+                for (int y = patches_i; y < patches_i + grid_y; ++y) {
+                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
+                        const int i = 3 * (y * img->nx + x);
+                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
+                        patch->buf[j]   = img->buf[i];
+                        patch->buf[j+1] = img->buf[i+1];
+                        patch->buf[j+2] = img->buf[i+2];
+                    }
+                }
+                images[images.size()-1].push_back(patch);
+            }
+        }
+    }
+
+    return images;
+}
+
+
+std::vector<std::vector<struct llava_image_embed *>> llava_image_embed_make_with_bytes_slice(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
+    clip_image_u8 * img = clip_image_u8_init();
+    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
+        clip_image_u8_free(img);
+        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+        return std::vector<std::vector<struct llava_image_embed *>>();
+    }
+    std::vector<std::vector<clip_image_u8 *>> imgs = slice_image(img);
+    for (size_t i = 0; i < imgs.size(); ++i) {
+        for (size_t j = 0; j < imgs[i].size(); ++j) {
+            LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+        }
+    }
+    std::vector<std::vector<llava_image_embed *>> results;
+
+    for (size_t i = 0; i < imgs.size(); ++i) {
+        results.push_back(std::vector<llava_image_embed *>());
+        for (size_t j = 0; j < imgs[i].size(); ++j) {
+            float* image_embed = NULL;
+            int n_image_pos = 0;
+            bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos);
+            if (!image_embed_result) {
+                clip_image_u8_free(img);
+                LOG_TEE("%s: couldn't embed the image\n", __func__);
+                return std::vector<std::vector<struct llava_image_embed *>>();
+            }
+
+            auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+            result->embed = image_embed;
+            result->n_image_pos = n_image_pos;
+            results[i].push_back(result);
+        }
+    }
+    clip_image_u8_free(img);
+    return results;
+}
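To make the grid search concrete, a worked example: a 1344x896 photo gives ratio = 1344*896/448^2 = 6, so the candidate grid counts are {5, 6, 7}; log_ratio = ln(1344/896) = ln(1.5) which is about 0.405, and among all factorizations the grid (3, 2) gives |ln(3/2) - ln(1.5)| of roughly 0, so the image is cut into 2 rows of 3 slices, each 448x448. A sketch exercising this (assumes the caller owns a 1344x896 clip_image_u8):

static void example_slice_grid(const clip_image_u8 * img_1344x896) {
    // expected: 2 rows x 3 columns of 448x448 slices, per the arithmetic above
    auto slices = slice_image(img_1344x896);
    LOG_TEE("rows: %zu, cols per row: %zu\n", slices.size(), slices.empty() ? (size_t)0 : slices[0].size());
}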
+static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
+    auto file = fopen(path, "rb");
+    if (file == NULL) {
+        LOG_TEE("%s: can't read file %s\n", __func__, path);
+        return false;
+    }
+
+    fseek(file, 0, SEEK_END);
+    auto fileSize = ftell(file);
+    fseek(file, 0, SEEK_SET);
+
+    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
+    if (buffer == NULL) {
+        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        perror("Memory allocation error");
+        fclose(file);
+        return false;
+    }
+    errno = 0;
+    size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
+    if (ferror(file)) {
+        die_fmt("read error: %s", strerror(errno));
+    }
+    if (ret != (size_t) fileSize) {
+        die("unexpectedly reached end of file");
+    }
+    fclose(file); // Close the file
+
+    *bytesOut = buffer;
+    *sizeOut = fileSize;
+    return true;
+}
+
+struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
+    unsigned char* image_bytes;
+    long image_bytes_length;
+    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
+    if (!loaded) {
+        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+        return NULL;
+    }
+
+    llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+    free(image_bytes);
+
+    return embed;
+}
+
+std::pair<struct llava_image_embed *, std::vector<std::vector<struct llava_image_embed *>>> llava_image_embed_make_with_filename_slice(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
+    unsigned char* image_bytes;
+    long image_bytes_length;
+    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
+    if (!loaded) {
+        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+        return {NULL, std::vector<std::vector<struct llava_image_embed *>>()};
+    }
+
+    llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+    std::vector<std::vector<struct llava_image_embed *>> embeds = llava_image_embed_make_with_bytes_slice(ctx_clip, n_threads, image_bytes, image_bytes_length);
+    free(image_bytes);
+    return {embed, embeds};
+}
+
+
+void llava_image_embed_free(struct llava_image_embed * embed) {
+    free(embed->embed);
+    free(embed);
+}
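Finally, a sketch of the sliced entry point as minicpmv-cli uses it ("photo.jpg" is a placeholder path): the caller owns both the main embed and every slice embed, and is responsible for freeing each one.

static void example_sliced(clip_ctx * ctx_clip, int n_threads) {
    auto both = llava_image_embed_make_with_filename_slice(ctx_clip, n_threads, "photo.jpg");
    llava_image_embed * main_embed = both.first;
    for (auto & row : both.second) {
        for (auto * e : row) {
            llava_image_embed_free(e); // each slice embed is heap-allocated
        }
    }
    if (main_embed) llava_image_embed_free(main_embed);
}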
diff --git a/examples/minicpmv/minicpmv.h b/examples/minicpmv/minicpmv.h
new file mode 100644
index 0000000000000..068b8d44b2132
--- /dev/null
+++ b/examples/minicpmv/minicpmv.h
@@ -0,0 +1,52 @@
+#ifndef LLAVA_H
+#define LLAVA_H
+
+#include "ggml.h"
+
+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define LLAVA_API __declspec(dllexport)
+#        else
+#            define LLAVA_API __declspec(dllimport)
+#        endif
+#    else
+#        define LLAVA_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define LLAVA_API
+#endif
+
+struct clip_ctx;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct llava_image_embed {
+    float * embed;
+    int n_image_pos;
+};
+
+/** sanity check for clip <-> llava embed size match */
+LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
+
+LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+
+/** build an image embed from image file bytes */
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+/** build an image embed from a path to an image filename */
+LLAVA_API std::vector<std::vector<struct llava_image_embed *>> llava_image_embed_make_with_bytes_slice(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+LLAVA_API std::pair<struct llava_image_embed *, std::vector<std::vector<struct llava_image_embed *>>> llava_image_embed_make_with_filename_slice(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
+/** free an embedding made with llava_image_embed_make_* */
+
+/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
+LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/examples/minicpmv/requirements.txt b/examples/minicpmv/requirements.txt
new file mode 100644
index 0000000000000..f80f727a79307
--- /dev/null
+++ b/examples/minicpmv/requirements.txt
@@ -0,0 +1,3 @@
+-r ../../requirements/requirements-convert.txt
+pillow~=10.2.0
+torch~=2.1.1
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 06cb26a7d495b..cfd48d16d6ecb 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -44,6 +44,7 @@ class LLM:
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE      = "{arch}.pooling_type"
         LOGIT_SCALE       = "{arch}.logit_scale"
+        TIE_LM_HEAD       = "{arch}.tie_lm_head"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -596,6 +597,7 @@ class MODEL_TENSOR(IntEnum):
     ],
     MODEL_ARCH.MINICPM: [
         MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_NORM,
@@ -899,6 +901,7 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_FEED_FORWARD_LENGTH   = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT    = Keys.LLM.TENSOR_DATA_LAYOUT
+KEY_TIE_LM_HEAD           = Keys.LLM.TIE_LM_HEAD

 # attention
 KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index e3dbca454ae05..b75dd472d9fee 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -350,6 +350,9 @@ def add_feed_forward_length(self, length: int) -> None:
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

+    def add_tie_lm_head(self, tie_lm_head: bool) -> None:
+        self.add_bool(Keys.LLM.TIE_LM_HEAD.format(arch=self.arch), tie_lm_head)
+
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

diff --git a/llama.cpp b/llama.cpp
index 7440c740fefbc..7894aac87c5cb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -285,6 +285,7 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
+    LLM_KV_TIE_LM_HEAD,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -361,6 +362,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_POOLING_TYPE ,     "%s.pooling_type"      },
     { LLM_KV_LOGIT_SCALE,       "%s.logit_scale"       },
+    { LLM_KV_TIE_LM_HEAD,       "%s.tie_lm_head"       },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1825,6 +1827,7 @@ struct llama_hparams {

     bool causal_attn = true;
     bool need_kq_pos = false;
+    bool tie_lm_head = true;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -3696,6 +3699,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_TIE_LM_HEAD,       hparams.tie_lm_head,   false);

     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -4707,20 +4711,12 @@ static bool llm_load_tensors(
             case LLM_ARCH_MINICPM:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    if (!hparams.tie_lm_head){
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    }

                     // output
-                    {
-                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        if (model.arch != LLM_ARCH_MINICPM){
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
-                            // if output is NULL, init from the input tok embed
-                            if (model.output == NULL) {
-                                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                                ml.n_created--; // artificial tensor
-                                ml.size_data += ggml_nbytes(model.output);
-                            }
-                        }
-                    }
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

                     for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
@@ -9542,7 +9538,9 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

         // scale the input embeddings
-        inpL = ggml_scale(ctx0, inpL, scale_embd);
+        if (batch.token) {
+            inpL = ggml_scale(ctx0, inpL, scale_embd);
+        }
         cb(inpL, "inp_scaled", -1);

         // inp_pos - contains the positions
@@ -9658,7 +9656,11 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);

         // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        if (hparams.tie_lm_head){
+            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        } else {
+            cur = ggml_mul_mat(ctx0, model.output, cur);
+        }
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);

From 6c1c4b4688622d8ee026643a07fabf8e0b514c09 Mon Sep 17 00:00:00 2001
From: Achazwl <323163497@qq.com>
Date: Wed, 1 May 2024 11:17:45 +0800
Subject: [PATCH 2/5] move STL return types out of extern "C"

---
 examples/minicpmv/minicpmv.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/minicpmv/minicpmv.h b/examples/minicpmv/minicpmv.h
index 068b8d44b2132..27c8362c6902d 100644
--- a/examples/minicpmv/minicpmv.h
+++ b/examples/minicpmv/minicpmv.h
@@ -36,9 +36,7 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
-LLAVA_API std::vector<std::vector<struct llava_image_embed *>> llava_image_embed_make_with_bytes_slice(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-LLAVA_API std::pair<struct llava_image_embed *, std::vector<std::vector<struct llava_image_embed *>>> llava_image_embed_make_with_filename_slice(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
 LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */

@@ -49,4 +47,7 @@ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const st
 }
 #endif

+LLAVA_API std::vector<std::vector<struct llava_image_embed *>> llava_image_embed_make_with_bytes_slice(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+LLAVA_API std::pair<struct llava_image_embed *, std::vector<std::vector<struct llava_image_embed *>>> llava_image_embed_make_with_filename_slice(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+
 #endif

From 36bff51a7ae4ea5bc5eaf6f49460699692ad4b54 Mon Sep 17 00:00:00 2001
From: Achazwl <323163497@qq.com>
Date: Fri, 3 May 2024 10:06:36 +0800
Subject: [PATCH 3/5] fix tokenizer.json tokenizer_config.json cpu()
---
 examples/minicpmv/minicpm-surgery.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/examples/minicpmv/minicpm-surgery.py b/examples/minicpmv/minicpm-surgery.py
index 97a02c6ca9b50..85b498c97b9e2 100644
--- a/examples/minicpmv/minicpm-surgery.py
+++ b/examples/minicpmv/minicpm-surgery.py
@@ -1,6 +1,6 @@
 import argparse
 import glob
-import os
+import os, json
 import torch
 from transformers import AutoModel, AutoTokenizer

@@ -16,12 +16,12 @@
 mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]

 # store these tensors in a new dictionary and torch.save them
-projector = {name: checkpoint[name].float() for name in mm_tensors}
+projector = {name: checkpoint[name].float().cpu() for name in mm_tensors}
 torch.save(projector, f"{args.model}/llava.projector")

 clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
 if len(clip_tensors) > 0:
-    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
+    clip = {name.replace("vpm.", ""): checkpoint[name].float().cpu() for name in clip_tensors}
     torch.save(clip, f"{args.model}/llava.clip")

     # added tokens should be removed to be able to convert Mistral models
@@ -42,6 +42,15 @@
 tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
 tok.save_pretrained(f"{args.model}/MiniCPM")
 os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM/modeling_minicpm.py")
+os.system(f"cp {args.model}/tokenizer.json {args.model}/MiniCPM/tokenizer.json")
+with open(f"{args.model}/MiniCPM/tokenizer_config.json", "r") as f:
+    d = json.load(f)
+    d.pop("auto_map")
+    d["tokenizer_class"] = "LlamaTokenizer"
+    d.pop("add_prefix_space")
+with open(f"{args.model}/MiniCPM/tokenizer_config.json", "w") as f:
+    json.dump(d, f, indent=2)
+
 print("Done!")
 print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")

From a76fbcd05054e39e8be325c10320397775d42ac3 Mon Sep 17 00:00:00 2001
From: Achazwl
Date: Wed, 8 May 2024 20:08:36 +0800
Subject: [PATCH 4/5] fix LLaVA side effect

---
 convert-hf-to-gguf.py                |  2 --
 examples/minicpmv/minicpm-surgery.py |  3 ++-
 gguf-py/gguf/constants.py            |  2 --
 gguf-py/gguf/gguf_writer.py          |  3 ---
 llama.cpp                            | 30 ++++++++++++++++++----------
 5 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 278edccdccc45..4fd916cba3ed7 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1638,8 +1638,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
-        if "tie_word_embeddings" in self.hparams:
-            self.gguf_writer.add_tie_lm_head(self.hparams["tie_word_embeddings"])

     def set_vocab(self):
         self._set_vocab_llama_hf()
diff --git a/examples/minicpmv/minicpm-surgery.py b/examples/minicpmv/minicpm-surgery.py
index 85b498c97b9e2..5916744a0ba0d 100644
--- a/examples/minicpmv/minicpm-surgery.py
+++ b/examples/minicpmv/minicpm-surgery.py
@@ -47,7 +47,8 @@
     d = json.load(f)
     d.pop("auto_map")
     d["tokenizer_class"] = "LlamaTokenizer"
-    d.pop("add_prefix_space")
+    if "add_prefix_space" in d:
+        d.pop("add_prefix_space")
 with open(f"{args.model}/MiniCPM/tokenizer_config.json", "w") as f:
     json.dump(d, f, indent=2)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index cfd48d16d6ecb..6562c66607db9 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -44,7 +44,6 @@ class LLM:
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE      = "{arch}.pooling_type"
         LOGIT_SCALE       = "{arch}.logit_scale"
-        TIE_LM_HEAD       = "{arch}.tie_lm_head"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -901,7 +900,6 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_FEED_FORWARD_LENGTH   = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT    = Keys.LLM.TENSOR_DATA_LAYOUT
-KEY_TIE_LM_HEAD           = Keys.LLM.TIE_LM_HEAD

 # attention
 KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index b75dd472d9fee..e3dbca454ae05 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -350,9 +350,6 @@ def add_feed_forward_length(self, length: int) -> None:
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

-    def add_tie_lm_head(self, tie_lm_head: bool) -> None:
-        self.add_bool(Keys.LLM.TIE_LM_HEAD.format(arch=self.arch), tie_lm_head)
-
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

diff --git a/llama.cpp b/llama.cpp
index 7894aac87c5cb..7f8d3ca577a27 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -285,7 +285,6 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
-    LLM_KV_TIE_LM_HEAD,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -362,7 +361,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_POOLING_TYPE ,     "%s.pooling_type"      },
     { LLM_KV_LOGIT_SCALE,       "%s.logit_scale"       },
-    { LLM_KV_TIE_LM_HEAD,       "%s.tie_lm_head"       },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1827,7 +1825,6 @@ struct llama_hparams {

     bool causal_attn = true;
     bool need_kq_pos = false;
-    bool tie_lm_head = true;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -3314,6 +3311,7 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));

         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, ggml_get_name(tensor));

         return tensor;
     }
@@ -3382,6 +3380,8 @@ struct llama_model_loader {
         ggml_set_name(tensor, name.c_str());

         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, name.c_str());
+
         return tensor;
     }
@@ -3699,7 +3699,6 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-    ml.get_key(LLM_KV_TIE_LM_HEAD,       hparams.tie_lm_head,   false);

     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -4711,8 +4710,12 @@ static bool llm_load_tensors(
             case LLM_ARCH_MINICPM:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    if (!hparams.tie_lm_head){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }

                     // output
@@ -4793,6 +4796,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease GROK\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -4922,6 +4926,7 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease FALCON\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5127,6 +5132,7 @@ static bool llm_load_tensors(
                    if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease MPT\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5249,6 +5255,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease QWEN2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5539,6 +5546,7 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
                     ml.n_created--; // artificial tensor
+                    printf("created tensor decrease GEMMA\n");
                     ml.size_data += ggml_nbytes(model.output);

                     const int64_t n_ff = hparams.n_ff;
@@ -5579,6 +5587,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease STARCODER2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }

@@ -5635,6 +5644,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease MAMBA\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5698,6 +5708,7 @@ static bool llm_load_tensors(
                         // init output from the input tok embed
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease COMMAND_R\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }

@@ -5735,6 +5746,7 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
+                        printf("created tensor decrease OLMO\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -9656,11 +9668,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);

         // lm_head
-        if (hparams.tie_lm_head){
-            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
-        } else {
-            cur = ggml_mul_mat(ctx0, model.output, cur);
-        }
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);

From f0d7be409d1d53b6fcd26aae7ce62a636933a8eb Mon Sep 17 00:00:00 2001
From: Achazwl
Date: Sun, 12 May 2024 16:40:51 +0800
Subject: [PATCH 5/5] remove irrelevant debug code

---
 llama.cpp | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 7f8d3ca577a27..2ec87e626ef40 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3311,7 +3311,6 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));

         n_created++;
-        printf("%s: created tensor '%s'\n", __func__, ggml_get_name(tensor));

         return tensor;
     }
@@ -3380,8 +3379,6 @@ struct llama_model_loader {
         ggml_set_name(tensor, name.c_str());

         n_created++;
-        printf("%s: created tensor '%s'\n", __func__, name.c_str());
-
         return tensor;
     }
@@ -4796,7 +4793,6 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
-                        printf("created tensor decrease GROK\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -4926,7 +4922,6 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
-                        printf("created tensor decrease FALCON\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5132,7 +5127,6 @@ static bool llm_load_tensors(
                     if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
-                        printf("created tensor decrease MPT\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5255,7 +5249,6 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
-                        printf("created tensor decrease QWEN2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5546,7 +5539,6 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
                     ml.n_created--; // artificial tensor
-                    printf("created tensor decrease GEMMA\n");
                     ml.size_data += ggml_nbytes(model.output);

                     const int64_t n_ff = hparams.n_ff;
@@ -5587,7 +5579,6 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
-                        printf("created tensor decrease STARCODER2\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }

@@ -5644,7 +5635,6 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
-                        printf("created tensor decrease MAMBA\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }
@@ -5708,7 +5698,6 @@ static bool llm_load_tensors(
                         // init output from the input tok embed
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
-                        printf("created tensor decrease COMMAND_R\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }

@@ -5746,7 +5735,6 @@ static bool llm_load_tensors(
                     if (model.output == NULL) {
                         model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                         ml.n_created--; // artificial tensor
-                        printf("created tensor decrease OLMO\n");
                         ml.size_data += ggml_nbytes(model.output);
                     }
                 }