From 235340d3ef0359c3d275047148cc894e9392e56c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 09:32:01 +0200
Subject: [PATCH 01/11] wip llava2

---
 examples/llava/clip-impl.h    |  55 +++++++++++
 examples/llava/clip.cpp       |  23 ++---
 examples/llava/clip.h         |   3 +
 examples/llava/gemma3-cli.cpp |  20 ++--
 examples/llava/llava2.cpp     | 174 ++++++++++++++++++++++++++++++++++
 examples/llava/llava2.h       | 102 ++++++++++++++++++++
 6 files changed, 350 insertions(+), 27 deletions(-)
 create mode 100644 examples/llava/llava2.cpp
 create mode 100644 examples/llava/llava2.h

diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index 685d6e7e09ad1..e9c23a59db7ee 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -1,6 +1,8 @@
 #include "ggml.h"
 #include "gguf.h"
 
+#include "clip.h"
+
 #include <climits>
 #include <cstdarg>
 #include <string>
@@ -120,6 +122,23 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
     return PROJECTOR_TYPE_UNKNOWN;
 }
 
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 //
 // logging
 //
@@ -178,6 +197,28 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ...
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  __VA_ARGS__)
 
+//
+// cpp wrappers
+//
+
+struct clip_image_u8_deleter {
+    void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
+};
+
+struct clip_image_f32_deleter {
+    void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
+};
+
+struct clip_image_f32_batch_deleter {
+    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
+};
+
+typedef std::unique_ptr<clip_image_u8,  clip_image_u8_deleter>  clip_image_u8_ptr;
+typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
+typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
+
+// TODO @ngxson : we're currently having a naming clash between struct clip_image_size and function clip_image_size()
+
 //
 // common utils
 //
@@ -214,6 +255,20 @@ static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
     s = std::move(builder);
 }
 
+// split string by a `std::string delim` instead of `char delim`
+static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    size_t pos = 0;
+    std::string token;
+    while ((pos = s.find(delimiter)) != std::string::npos) {
+        token = s.substr(0, pos);
+        tokens.push_back(token);
+        s.erase(0, pos + delimiter.length());
+    }
+    tokens.push_back(s);
+    return tokens;
+}
+
 //
 // gguf utils
 //
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e9520f3d1a378..f29f1c7aab7ce 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -32,23 +32,6 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 
 //#define CLIP_DEBUG_FUNCTIONS
 
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 #ifdef CLIP_DEBUG_FUNCTIONS
 static void clip_image_write_image_to_ppm(const clip_image_u8 & img, const std::string & filename) {
     std::ofstream file(filename, std::ios::binary);
@@ -1618,6 +1601,12 @@ struct clip_image_f32 * clip_image_f32_init() {
     return new clip_image_f32();
 }
 
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
+    if (nx) *nx = img->nx;
+    if (ny) *ny = img->ny;
+    return img->buf.data();
+}
+
 void clip_image_u8_free(struct clip_image_u8 * img)   { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index d806465bf68bb..9f6cc83e0280a 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -77,6 +77,9 @@ CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
+// nx, ny are the output image dimensions
+CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 4f89c0e15b4e9..b5e8a305ce05e 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -2,11 +2,10 @@
 #include "log.h"
 #include "common.h"
 #include "sampling.h"
-#include "clip.h"
-#include "stb_image.h"
 #include "llama.h"
 #include "ggml.h"
 #include "console.h"
+#include "llava2.h"
 
 #include <vector>
 #include <limits.h>
@@ -57,8 +56,8 @@ static void sigint_handler(int signo) {
 #endif
 
 struct gemma3_context {
-    struct clip_ctx * ctx_clip = NULL;
-    common_init_result llama_init;
+    llava2_context_ptr ctx_llava2;
+    common_init_result  llama_init;
 
     llama_model * model;
     llama_context * lctx;
@@ -79,16 +78,16 @@ struct gemma3_context {
 
     void init_clip_model(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
-        if (!ctx_clip) {
+        ctx_llava2 = llava2_init_from_file(clip_path, model, llava2_context_params{
+            /* use_gpu   */ true,
+            /* n_threads */ params.cpuparams.n_threads,
+            /* verbosity */ GGML_LOG_LEVEL_INFO,
+        });
+        if (!ctx_llava2.get()) {
             LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
             exit(1);
         }
     }
-
-    ~gemma3_context() {
-        clip_free(ctx_clip);
-    }
 };
 
 struct decode_embd_batch {
@@ -271,6 +270,7 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
+        std::string prompt = "<start_of_turn>user\n" + params.prompt + "<end_of_turn><start_of_turn>model\n";
         if (eval_text(ctx, "<start_of_turn>user\n")) {
             return 1;
         }
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
new file mode 100644
index 0000000000000..a80ffe5fdf386
--- /dev/null
+++ b/examples/llava/llava2.cpp
@@ -0,0 +1,174 @@
+#include "clip.h"
+#include "clip-impl.h"
+#include "llava2.h"
+
+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <vector>
+
+static const char * IMG_MARKER = "<image>";
+
+struct llava2_context {
+    struct clip_ctx * ctx_clip;
+    const struct llama_model * text_model;
+    std::vector<float> image_embd_v; // image embedding vector
+    int n_threads;
+
+    llava2_context(const char * mmproj_fname,
+                   const struct llama_model * text_model,
+                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads) {
+        clip_context_params ctx_clip_params;
+        ctx_clip_params.use_gpu   = ctx_params.use_gpu;
+        ctx_clip_params.verbosity = ctx_params.verbosity;
+        ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
+        if (!ctx_clip) {
+            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
+        }
+        this->text_model = text_model;
+    }
+
+    ~llava2_context() {
+        clip_free(ctx_clip);
+    }
+};
+
+struct llava2_image_tokens_data {
+    clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
+};
+
+llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
+        const struct llama_model * text_model,
+        const struct llava2_context_params ctx_params) {
+    try {
+        auto ctx = std::make_shared<llava2_context>(mmproj_fname, text_model, ctx_params);
+        return ctx;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return nullptr;
+    }
+}
+
+int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_file(fname, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image %s\n", fname);
+        return 1;
+    }
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize(output.nx * output.ny * 3);
+    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
+    return 0;
+}
+
+// copied from common_tokenize
+static std::vector<llama_token> llava2_tokenize_text_internal(
+    const struct llama_vocab * vocab,
+           const std::string & text,
+                        bool   add_special,
+                        bool   parse_special) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + 2 * add_special;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+int32_t llava2_tokenize(llava2_context_ptr & ctx,
+        std::vector<llava2_input_chunk> & output,
+        const std::string & prompt,
+        bool add_special,
+        bool parse_special,
+        const std::vector<llava2_bitmap> & bitmaps) {
+    auto vocab = llama_model_get_vocab(ctx->text_model);
+
+    std::vector<std::string> parts = string_split_str(prompt, IMG_MARKER);
+    output.clear();
+    output.reserve(parts.size());
+
+    size_t i_img = 0;
+
+    for (const auto & part : parts) {
+        //printf("tokenizing part: %s\n", part.c_str());
+        bool add_bos = &parts.front() == &part;
+        auto tokens = llava2_tokenize_text_internal(vocab, part, add_special && add_bos, parse_special);
+        if (tokens.empty()) {
+            continue;
+        }
+        output.push_back({
+            LLAVA2_INPUT_CHUNK_TYPE_TEXT,
+            std::move(tokens),
+            {},
+        });
+
+        if (&parts.back() != &part) {
+            // add image token to middle of 2 parts
+
+            if (i_img >= bitmaps.size()) {
+                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
+                return 2;
+            }
+
+            // shim layer
+            clip_image_u8_ptr img_u8(clip_image_u8_init());
+            img_u8->nx = bitmaps[i_img].nx;
+            img_u8->ny = bitmaps[i_img].ny;
+            img_u8->buf.resize(bitmaps[i_img].data.size());
+            std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
+
+            // preprocess image
+            clip_image_f32_batch_ptr batch_f32;
+            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get());
+            if (!ok) {
LOG_ERR("Unable to preprocess image\n"); + return 1; + } + + llava2_image_tokens image_tokens; + //image_tokens.nx = ...; + //image_tokens.ny = ...; + image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image + image_tokens.data = std::unique_ptr( + new llava2_image_tokens_data{ + std::move(batch_f32), + } + ); + + output.push_back({ + LLAVA2_INPUT_CHUNK_TYPE_IMAGE, + {}, + std::move(image_tokens), + }); + i_img++; + } + } + + return 0; +} + +LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx, + const llava2_image_tokens & image_tokens) { + ctx->image_embd_v.reserve(image_tokens.n_tokens * clip_n_mmproj_embd(ctx->ctx_clip)); + return clip_image_batch_encode( + ctx->ctx_clip, + ctx->n_threads, + image_tokens.data->batch_f32.get(), + ctx->image_embd_v.data()); +} + +LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) { + return ctx->image_embd_v.data(); +} diff --git a/examples/llava/llava2.h b/examples/llava/llava2.h new file mode 100644 index 0000000000000..188c18fdc259b --- /dev/null +++ b/examples/llava/llava2.h @@ -0,0 +1,102 @@ +#ifndef LLAVA2_H +#define LLAVA2_H + +#include "ggml.h" +#include "llama.h" +#include "clip.h" + +#include +#include +#include + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define LLAVA2_API __declspec(dllexport) +# else +# define LLAVA2_API __declspec(dllimport) +# endif +# else +# define LLAVA2_API __attribute__ ((visibility ("default"))) +# endif +#else +# define LLAVA2_API +#endif + +#ifdef __cplusplus + +enum llava2_input_chunk_type { + LLAVA2_INPUT_CHUNK_TYPE_TEXT, + LLAVA2_INPUT_CHUNK_TYPE_IMAGE, +}; + +struct llava2_context; +struct llava2_image_tokens_data; // internal data + +using llava2_context_ptr = std::shared_ptr; +using llava2_image_tokens_data_ptr = std::shared_ptr; + +// represents raw image data, layout is RGBRGBRGB... +// length of data must be nx * ny * 3 +struct llava2_bitmap { + uint32_t nx; + uint32_t ny; + std::vector data; +}; + +// represents the processed image as tokens (to be encoded) +struct llava2_image_tokens { + uint32_t nx; // number of tokens in x direction + uint32_t ny; // number of tokens in y direction + uint32_t n_tokens; // == nx * ny + llava2_image_tokens_data_ptr data; // internal data +}; + +struct llava2_input_chunk { + llava2_input_chunk_type type; + std::vector tokens_text; + llava2_image_tokens tokens_image; +}; + +struct llava2_context_params { + bool use_gpu = true; + int n_threads = 4; + enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO; +}; + +LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname, + const llama_model * text_model, + const llava2_context_params ctx_params); + +// helper function to load an image from a file +LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output); + +// tokenize an input text prompt and an image +// the prompt must have the input image marker in it +// the marker will be replaced with the image tokens +// for example: +// "here is an image: \ndescribe it in detail." +// this will gives 3 chunks: +// 1. "here is an image: " +// 2. (image tokens) +// 3. "\ndescribe it in detail." 
+// number of bitmaps must be equal to the number of markers in the prompt
+LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx,
+        std::vector<llava2_input_chunk> & output,
+        const std::string & prompt,
+        bool add_special,
+        bool parse_special,
+        const std::vector<llava2_bitmap> & bitmaps);
+
+LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
+        const llava2_image_tokens & image_tokens);
+
+LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx);
+
+#else
+
+static_assert(false && "C header is not yet supported by this library");
+
+#endif
+
+#endif

From 96bf95ed5fdc762f3bc280eac559874a949dfc15 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 12:11:10 +0200
Subject: [PATCH 02/11] migrated gemma3 to llava2

---
 examples/llava/CMakeLists.txt |  32 +++++-
 examples/llava/clip.cpp       |   2 +
 examples/llava/gemma3-cli.cpp | 180 ++++++++++++++--------------------
 examples/llava/llava2.cpp     | 131 +++++++++++++++++++++++--
 examples/llava/llava2.h      |  29 +++++-
 5 files changed, 257 insertions(+), 117 deletions(-)

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index f275ce1ccd003..491435ec2342e 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -1,3 +1,5 @@
+# llava (legacy)
+
 add_library(llava OBJECT
             llava.cpp
             llava.h
@@ -22,12 +24,40 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS llava_shared LIBRARY)
 endif()
 
+# llava2
+
+add_library(llava2 OBJECT
+            llava2.cpp
+            llava2.h
+            clip.cpp
+            clip.h
+            clip-impl.h
+            )
+
+target_link_libraries(llava2 PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(llava2 PUBLIC .)
+target_include_directories(llava2 PUBLIC ../..)
+
+target_compile_features(llava2 PRIVATE cxx_std_17)
+
+add_library(llava2_static STATIC $<TARGET_OBJECTS:llava2>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(llava2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(llava2 PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(llava2_shared SHARED $<TARGET_OBJECTS:llava2>)
+    target_link_libraries(llava2_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS llava2_shared LIBRARY)
+endif()
+
 if (NOT MSVC)
     target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
+    target_compile_options(llava2 PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
 
 if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
+    add_dependencies(llava2 BUILD_INFO)
 endif()
 
 set(TARGET llama-llava-cli)
@@ -55,7 +85,7 @@ set(TARGET llama-gemma3-cli)
 add_executable(${TARGET} gemma3-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llava2 ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-llava-clip-quantize-cli)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index f29f1c7aab7ce..2072a318cfda6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2330,6 +2330,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
         int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
         int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
         n_patches = x_patch * y_patch;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        n_patches = 256;
     }
 
     return n_patches;
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index b5e8a305ce05e..fd3778f918def 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -5,6 +5,7 @@
 #include "llama.h"
 #include "ggml.h"
 #include "console.h"
+#include "chat.h"
 #include "llava2.h"
 
 #include <vector>
 #include <limits.h>
@@ -56,13 +57,18 @@ static void sigint_handler(int signo) {
 #endif
 
 struct gemma3_context {
-    llava2_context_ptr ctx_llava2;
+    llava2_context_ptr ctx_vision;
     common_init_result llama_init;
 
     llama_model * model;
     llama_context * lctx;
     const llama_vocab * vocab;
     llama_batch batch;
+    int n_batch;
+
+    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
+    // so here we don't need to keep track of chat history
+    common_chat_templates_ptr tmpls;
 
     int n_threads = 1;
     llama_pos n_past = 0;
@@ -73,18 +79,20 @@ struct gemma3_context {
         vocab = llama_model_get_vocab(model);
         n_threads = params.cpuparams.n_threads;
         batch = llama_batch_init(params.n_batch, 0, 1);
-        init_clip_model(params);
+        n_batch = params.n_batch;
+        tmpls = common_chat_templates_init(model, params.chat_template);
+        init_vision_context(params);
     }
 
-    void init_clip_model(common_params & params) {
+    void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_llava2 = llava2_init_from_file(clip_path, model, llava2_context_params{
+        ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
             /* use_gpu   */ true,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         });
-        if (!ctx_llava2.get()) {
-            LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
+        if (!ctx_vision.get()) {
+            LOG_ERR("Failed to load vision model from %s\n", clip_path);
             exit(1);
         }
     }
@@ -123,77 +131,6 @@ struct decode_embd_batch {
     }
 };
 
-static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
-    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
-    common_batch_clear(ctx.batch);
-    for (llama_token & t : tokens) {
-        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
-    }
-    if (logits_last) {
-        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
-    }
-    // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
-    if (llama_decode(ctx.lctx, ctx.batch)) {
-        LOG_ERR("Failed to decode text\n");
-        return 1;
-    }
-    return 0;
-}
-
-static int eval_image(gemma3_context & ctx, std::string & fname) {
-    std::vector<float> image_embd_v;
-    int n_embd = llama_model_n_embd(ctx.model);
-    int n_tokens = 256;
-    image_embd_v.resize(n_tokens * n_embd);
-
-    bool ok;
-    struct clip_image_u8 * img_u8 = clip_image_u8_init();
-    ok = clip_image_load_from_file(fname.c_str(), img_u8);
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname.c_str());
-        clip_image_u8_free(img_u8);
-        return 2; // non-fatal error
-    }
-
-    clip_image_f32_batch batch_f32;
-    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
-    if (!ok) {
-        LOG_ERR("Unable to preprocess image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-
-    int64_t t0 = ggml_time_ms();
-    LOG("Encoding image %s\n", fname.c_str());
-    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
-    if (!ok) {
-        LOG_ERR("Unable to encode image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-
-    clip_image_f32_batch_free(&batch_f32);
-    clip_image_u8_free(img_u8);
-
-    // decode image embeddings
-    int64_t t1 = ggml_time_ms();
-    eval_text(ctx, "<start_of_image>");
-    llama_set_causal_attn(ctx.lctx, false);
-    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
-    if (llama_decode(ctx.lctx, batch_img.batch)) {
-        LOG_ERR("failed to decode image\n");
-        return 1;
-    }
-    ctx.n_past += n_tokens;
-    llama_set_causal_attn(ctx.lctx, true);
-    eval_text(ctx, "<end_of_image>");
-    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
-    return 0;
-}
-
 static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
     for (int i = 0; i < n_predict; i++) {
         if (i > n_predict || !g_is_generating) {
@@ -223,6 +160,41 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
     return 0;
 }
 
+static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
+    std::vector<llava2_bitmap> bitmaps;
+
+    common_chat_templates_inputs tmpl_inputs;
+    tmpl_inputs.messages = {msg};
+    tmpl_inputs.add_generation_prompt = true;
+    tmpl_inputs.use_jinja = false; // jinja is buggy here
+    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+
+    for (auto & fname : images_fname) {
+        llava2_bitmap bitmap;
+        if (llava2_bitmap_init_from_file(fname.c_str(), bitmap)) {
+            LOG_ERR("Unable to load image %s\n", fname.c_str());
+            return 2; // image not found
+        }
+        bitmaps.push_back(std::move(bitmap));
+    }
+
+    std::vector<llava2_input_chunk> chunks;
+    if (llava2_tokenize(ctx.ctx_vision, chunks, formatted_chat.prompt, add_bos, true, bitmaps)) {
+        LOG_ERR("Unable to tokenize prompt\n");
+        return 1;
+    }
+
+    if (llava2_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
+        LOG_ERR("Unable to eval prompt\n");
+        return 1;
+    }
+
+    ctx.n_past += llava2_helper_get_n_tokens(chunks);
+
+    return 0;
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
 
@@ -264,22 +236,15 @@ int main(int argc, char ** argv) {
 #endif
     }
 
-    if (eval_text(ctx, "<bos>")) {
-        return 1;
-    }
-
     if (is_single_turn) {
         g_is_generating = true;
-        std::string prompt = "<start_of_turn>user\n" + params.prompt + "<end_of_turn><start_of_turn>model\n";
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-        for (auto & fname : params.image) {
-            if (eval_image(ctx, fname)) {
-                return 1;
-            }
+        if (params.prompt.find("<__image__>") == std::string::npos) {
+            params.prompt += " <__image__>";
         }
-        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = params.prompt;
+        if (eval_message(ctx, msg, params.image, true)) {
             return 1;
         }
         if (generate_response(ctx, smpl, n_predict)) {
             return 1;
         }
@@ -293,9 +258,9 @@ int main(int argc, char ** argv) {
     LOG("\n /quit or /exit        exit the program");
     LOG("\n");
 
-    if (eval_text(ctx, "<start_of_turn>user\n")) {
-        return 1;
-    }
+    bool is_first_msg = true;
+    std::vector<std::string> images_fname;
+    std::string content;
 
     while (true) {
         g_is_generating = false;
@@ -320,24 +285,31 @@ int main(int argc, char ** argv) {
         g_is_generating = true;
         if (line.find("/image") == 0) {
             std::string image = line.substr(7);
-            int res = eval_image(ctx, image);
-            if (res == 2) {
-                continue; // image not found
-            }
-            if (res) {
-                return 1;
-            }
+            images_fname.push_back(string_strip(image));
+            content += "<__image__>";
             continue;
+        } else {
+            content += line;
         }
-        if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
-            return 1;
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = content;
+        int ret = eval_message(ctx, msg, images_fname, is_first_msg);
+        if (ret == 2) {
+            // non-fatal error
+            images_fname.clear();
+            content.clear();
+            continue;
        }
-        if (generate_response(ctx, smpl, n_predict)) {
+        if (ret) {
             return 1;
         }
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
+        if (generate_response(ctx, smpl, n_predict)) {
             return 1;
         }
+        images_fname.clear();
+        content.clear();
+        is_first_msg = false;
     }
 }
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index a80ffe5fdf386..379a061715838 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -12,17 +12,18 @@
 #include <limits>
 #include <vector>
 
-static const char * IMG_MARKER = "<image>";
-
 struct llava2_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
     int n_threads;
+    std::string image_marker;
+
+    // TODO @ngxson : add timings
 
-    llava2_context(const char * mmproj_fname,
+    llava2_context(const char * mmproj_fname,
                    const struct llama_model * text_model,
-                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads) {
+                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -95,7 +96,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         const std::vector<llava2_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
-    std::vector<std::string> parts = string_split_str(prompt, IMG_MARKER);
+    std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());
 
@@ -130,7 +131,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
             std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
 
             // preprocess image
-            clip_image_f32_batch_ptr batch_f32;
+            clip_image_f32_batch_ptr batch_f32(new clip_image_f32_batch);
             bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get());
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
@@ -161,14 +162,128 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
 
 LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
         const llava2_image_tokens & image_tokens) {
-    ctx->image_embd_v.reserve(image_tokens.n_tokens * clip_n_mmproj_embd(ctx->ctx_clip));
-    return clip_image_batch_encode(
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+    ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
+    bool ok = clip_image_batch_encode(
         ctx->ctx_clip,
         ctx->n_threads,
         image_tokens.data->batch_f32.get(),
         ctx->image_embd_v.data());
+    return ok ? 0 : 1;
 }
 
 LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) {
     return ctx->image_embd_v.data();
 }
+
+size_t llava2_helper_get_n_tokens(std::vector<llava2_input_chunk> & chunks) {
+    size_t n_tokens = 0;
+    for (auto & chunk : chunks) {
+        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
+            n_tokens += chunk.tokens_text.size();
+        } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
+            n_tokens += chunk.tokens_image.n_tokens;
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
+        }
+    }
+    return n_tokens;
+}
+
+// helper struct to make working with embd batch easier
+// note: this will be removed after llama_batch_ext refactoring
+struct decode_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
+int32_t llava2_helper_eval(llava2_context_ptr & ctx,
+        llama_context * lctx,
+        std::vector<llava2_input_chunk> & chunks,
+        llama_pos pos0,
+        llama_seq_id seq_id,
+        int32_t n_batch) {
+    int32_t ret;
+    llama_pos n_past = pos0;
+    llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
+
+    for (auto & chunk : chunks) {
+        bool is_last = &chunk == &chunks.back();
+        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
+            // TODO @ngxson : may need to split into smaller batches
+            text_batch.n_tokens = chunk.tokens_text.size();
+            for (size_t i = 0; i < chunk.tokens_text.size(); i++) {
+                text_batch.token   [i] = chunk.tokens_text[i];
+                text_batch.pos     [i] = n_past++;
+                text_batch.n_seq_id[i] = 1;
+                text_batch.seq_id  [i][0] = seq_id;
+                text_batch.logits  [i] = false;
+            }
+            if (is_last) {
+                // always get logits for last input chunk
+                text_batch.logits[text_batch.n_tokens - 1] = true;
+            }
+            ret = llama_decode(lctx, text_batch);
+            if (ret != 0) {
+                LOG_ERR("failed to decode text\n");
+                llama_batch_free(text_batch);
+                return ret;
+            }
+
+        } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
+            GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
+            ret = llava2_encode(ctx, chunk.tokens_image);
+            if (ret != 0) {
+                LOG_ERR("failed to encode image\n");
+                llama_batch_free(text_batch);
+                return ret;
+            }
+
+            int32_t n_tokens = chunk.tokens_image.n_tokens;
+            float * embd = llava2_get_output_embd(ctx);
+            decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+            ret = llama_decode(lctx, batch_img.batch);
+            if (ret != 0) {
+                LOG_ERR("failed to decode image\n");
+                llama_batch_free(text_batch);
+                return ret;
+            }
+
+            n_past += n_tokens;
+
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
+        }
+    }
+
+    llama_batch_free(text_batch);
+    return 0;
+}
diff --git a/examples/llava/llava2.h b/examples/llava/llava2.h
index 188c18fdc259b..3691e30c386cd 100644
--- a/examples/llava/llava2.h
+++ b/examples/llava/llava2.h
@@ -62,25 +62,29 @@ struct llava2_context_params {
     bool use_gpu = true;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
+    const char * image_marker = "<__image__>";
image_marker = "<__image__>"; }; +// initialize the llava2 context +// return nullptr on failure LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname, const llama_model * text_model, const llava2_context_params ctx_params); // helper function to load an image from a file +// returns 0 on success LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output); // tokenize an input text prompt and an image -// the prompt must have the input image marker in it +// the prompt must have the input image marker (default: "<__image__>") in it // the marker will be replaced with the image tokens // for example: -// "here is an image: \ndescribe it in detail." +// "here is an image: <__image__>\ndescribe it in detail." // this will gives 3 chunks: // 1. "here is an image: " -// 2. (image tokens) +// 2. (image tokens) // 3. "\ndescribe it in detail." -// number of bitmaps must be equal to the number of markers in the prompt +// number of bitmaps must be equal to the number of image markers in the prompt LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx, std::vector & output, const std::string & prompt, @@ -88,11 +92,28 @@ LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx, bool parse_special, const std::vector & bitmaps); +// returns 0 on success LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx, const llava2_image_tokens & image_tokens); +// get output embeddings from the last encode pass LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx); +// simple helper to count the total number of tokens from a list of chunks, useful to keep track of n_past +LLAVA2_API size_t llava2_helper_get_n_tokens(std::vector & chunks); + +// helper function that automatically: +// 1. run llama_decode() on text chunks +// 2. run llava2_encode() on image chunks, then llava2_get_output_embd() and then llama_decode() +// if any of the llava2_encode() or llama_decode() calls return non-zero, stop and forward the error +// otherwise, returns 0 on success +LLAVA2_API int32_t llava2_helper_eval(llava2_context_ptr & ctx, + llama_context * lctx, + std::vector & chunks, + llama_pos pos0, + llama_seq_id seq_id, + int32_t n_batch); + #else static_assert(false && "C header is not yet supported by this library"); From 7cc4108a9b6bfe406839a186c5f488661b365f8f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 9 Apr 2025 14:47:07 +0200 Subject: [PATCH 03/11] add timings --- examples/llava/CMakeLists.txt | 1 + examples/llava/gemma3-cli.cpp | 1 + examples/llava/llava2.cpp | 11 ++++++++++- examples/llava/llava2.h | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 491435ec2342e..390a8ff9a132a 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -38,6 +38,7 @@ target_link_libraries(llava2 PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(llava2 PUBLIC .) target_include_directories(llava2 PUBLIC ../..) 
+target_include_directories(llava2 PUBLIC ../../common) # for stb_image.h
 
 target_compile_features(llava2 PRIVATE cxx_std_17)
 
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index fd3778f918def..ae32d146e3401 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -88,6 +88,7 @@ struct gemma3_context {
         const char * clip_path = params.mmproj.path.c_str();
         ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
             /* use_gpu   */ true,
+            /* timings   */ true,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         });
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index 379a061715838..a50bf6138ca9a 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -16,6 +16,7 @@ struct llava2_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
+    bool print_timings;
     int n_threads;
     std::string image_marker;
 
@@ -23,7 +24,7 @@ struct llava2_context {
 
     llava2_context(const char * mmproj_fname,
                    const struct llama_model * text_model,
-                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+                   const struct llava2_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -260,22 +261,30 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
 
         } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
+            int64_t t0 = ggml_time_ms();
             ret = llava2_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
                 return ret;
             }
+            if (ctx->print_timings) {
+                LOG_INF("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+            }
 
             int32_t n_tokens = chunk.tokens_image.n_tokens;
             float * embd = llava2_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+            int64_t t1 = ggml_time_ms();
             ret = llama_decode(lctx, batch_img.batch);
             if (ret != 0) {
                 LOG_ERR("failed to decode image\n");
                 llama_batch_free(text_batch);
                 return ret;
             }
+            if (ctx->print_timings) {
+                LOG_INF("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+            }
 
             n_past += n_tokens;
 
diff --git a/examples/llava/llava2.h b/examples/llava/llava2.h
index 3691e30c386cd..501fe781ffadd 100644
--- a/examples/llava/llava2.h
+++ b/examples/llava/llava2.h
@@ -60,6 +60,7 @@ struct llava2_input_chunk {
 
 struct llava2_context_params {
     bool use_gpu = true;
+    bool print_timings = true;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
     const char * image_marker = "<__image__>";

From a9ef623187133720765b33ef5d107a07b1a7e478 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 14:53:29 +0200
Subject: [PATCH 04/11] correct pre/postfix

---
 examples/llava/clip-impl.h |  6 ++++++
 examples/llava/clip.cpp    |  8 ++++++++
 examples/llava/llava2.cpp  | 11 +++++++++++
 3 files changed, 25 insertions(+)

diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index e9c23a59db7ee..8fa8dcdc692a8 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -326,3 +326,9 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
             return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
     }
 }
+
+//
+// API used internally with llava2
+//
+
+projector_type clip_get_projector_type(const struct clip_ctx * ctx);
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 07c3df23c53bb..f8ab6a89e6a38 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2884,3 +2884,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
     clip_image_encode(ctx, n_threads, &clip_img, vec);
     return true;
 }
+
+//
+// API used internally with llava2
+//
+
+projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
+    return ctx->proj_type;
+}
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index a50bf6138ca9a..fa2d73cd38a34 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -97,6 +97,17 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         const std::vector<llava2_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
+    std::string prompt_modified(prompt);
+    std::string marker_modified(ctx->image_marker);
+    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
+    // a bit hacky here, but works for now
+    // for some models, we need to add prefix and suffix to the image embeddings
+    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+        // <start_of_image> ... (image embeddings) ... <end_of_image>
+        marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+    }
+
     std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());

From 3b25bd944cbea7c67ec2fd641f8a0ebe900a82e0 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 15:00:32 +0200
Subject: [PATCH 05/11] fix missing include

---
 examples/llava/clip-impl.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index 8fa8dcdc692a8..d923b04add7aa 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -9,6 +9,7 @@
 #include <map>
 #include <sstream>
 #include <vector>
+#include <cinttypes>
 
 // Internal header for clip.cpp
 

From 1576c82c0fea39548e9f589ffe93db3cc55e5420 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 15:03:07 +0200
Subject: [PATCH 06/11] fix compilation unused var warn

---
 examples/llava/llava2.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index fa2d73cd38a34..77abb689566d8 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -151,8 +151,8 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         }
 
         llava2_image_tokens image_tokens;
-        //image_tokens.nx = ...;
-        //image_tokens.ny = ...;
+        image_tokens.nx = 0; // TODO
+        image_tokens.ny = 0; // TODO
         image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
         image_tokens.data = std::unique_ptr<llava2_image_tokens_data>(
             new llava2_image_tokens_data{

From 117bf734cc36107031eebe4e01adab9ab5759137 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 17:55:14 +0200
Subject: [PATCH 07/11] update llava2_tokenize

---
 examples/llava/gemma3-cli.cpp |  6 +++++-
 examples/llava/llava2.cpp     | 17 +++++++++--------
 examples/llava/llava2.h      | 12 +++++++++---
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index ae32d146e3401..423efc58024e0 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -181,7 +181,11 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
     }
 
     std::vector<llava2_input_chunk> chunks;
-    if (llava2_tokenize(ctx.ctx_vision, chunks, formatted_chat.prompt, add_bos, true, bitmaps)) {
+    llava2_input_text text;
+    text.text          = formatted_chat.prompt;
+    text.add_special   = add_bos;
+    text.parse_special = true;
+    if (llava2_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
         LOG_ERR("Unable to tokenize prompt\n");
         return 1;
     }
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index 77abb689566d8..1bc153af0edad 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -91,13 +91,11 @@ static std::vector<llama_token> llava2_tokenize_text_internal(
 
 int32_t llava2_tokenize(llava2_context_ptr & ctx,
         std::vector<llava2_input_chunk> & output,
-        const std::string & prompt,
-        bool add_special,
-        bool parse_special,
+        const llava2_input_text & text,
         const std::vector<llava2_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
-    std::string prompt_modified(prompt);
+    std::string prompt_modified(text.text);
     std::string marker_modified(ctx->image_marker);
     projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
     // a bit hacky here, but works for now
@@ -108,7 +106,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }
 
-    std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
+    std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());
 
@@ -117,7 +115,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
     for (const auto & part : parts) {
         //printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
-        auto tokens = llava2_tokenize_text_internal(vocab, part, add_special && add_bos, parse_special);
+        auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
         if (tokens.empty()) {
             continue;
         }
@@ -273,6 +271,9 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
         } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
             int64_t t0 = ggml_time_ms();
+            if (ctx->print_timings) {
+                LOG_INF("encoding image...\n");
+            }
             ret = llava2_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
@@ -280,7 +281,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+                LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
             int32_t n_tokens = chunk.tokens_image.n_tokens;
@@ -294,7 +295,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+                LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
             }
 
             n_past += n_tokens;
diff --git a/examples/llava/llava2.h b/examples/llava/llava2.h
index 501fe781ffadd..06126a8f12680 100644
--- a/examples/llava/llava2.h
+++ b/examples/llava/llava2.h
@@ -66,6 +66,12 @@ struct llava2_context_params {
     const char * image_marker = "<__image__>";
 };
 
+struct llava2_input_text {
+    std::string text;
+    bool add_special;
+    bool parse_special;
+};
+
 // initialize the llava2 context
 // return nullptr on failure
 LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
@@ -74,6 +80,7 @@ LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
 
 // helper function to load an image from a file
 // returns 0 on success
+// this function is thread-safe
 LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output);
 
 // tokenize an input text prompt and an image
@@ -86,11 +93,10 @@ LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output);
 // 2. (image tokens)
 // 3. "\ndescribe it in detail."
 // number of bitmaps must be equal to the number of image markers in the prompt
+// this function is thread-safe (shared ctx)
 LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx,
         std::vector<llava2_input_chunk> & output,
-        const std::string & prompt,
-        bool add_special,
-        bool parse_special,
+        const llava2_input_text & text,
         const std::vector<llava2_bitmap> & bitmaps);

From a6625fa68b0c234dbd1349b4b05efdda0d4ee9e1 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 10 Apr 2025 12:00:30 +0200
Subject: [PATCH 08/11] change name llava2 --> mtmd

---
 examples/llava/CMakeLists.txt           | 36 +++++++--------
 examples/llava/clip-impl.h              |  2 +-
 examples/llava/clip.cpp                 |  2 +-
 examples/llava/gemma3-cli.cpp           | 22 +++++-----
 examples/llava/{llava2.cpp => mtmd.cpp} | 54 +++++++++++------------
 examples/llava/{llava2.h => mtmd.h}     | 58 ++++++++++++-------------
 6 files changed, 87 insertions(+), 87 deletions(-)
 rename examples/llava/{llava2.cpp => mtmd.cpp} (85%)
 rename examples/llava/{llava2.h => mtmd.h} (60%)

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index 390a8ff9a132a..c76555c59f0fd 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -24,41 +24,41 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS llava_shared LIBRARY)
 endif()
 
-# llava2
+# mtmd
 
-add_library(llava2 OBJECT
-            llava2.cpp
-            llava2.h
+add_library(mtmd OBJECT
+            mtmd.cpp
+            mtmd.h
             clip.cpp
             clip.h
             clip-impl.h
             )
 
-target_link_libraries(llava2 PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
 
-target_include_directories(llava2 PUBLIC .)
-target_include_directories(llava2 PUBLIC ../..)
-target_include_directories(llava2 PUBLIC ../../common) # for stb_image.h
+target_include_directories(mtmd PUBLIC .)
+target_include_directories(mtmd PUBLIC ../..)
+target_include_directories(mtmd PUBLIC ../../common) # for stb_image.h
 
-target_compile_features(llava2 PRIVATE cxx_std_17)
+target_compile_features(mtmd PRIVATE cxx_std_17)
 
-add_library(llava2_static STATIC $<TARGET_OBJECTS:llava2>)
+add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
 if (BUILD_SHARED_LIBS)
-    set_target_properties(llava2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llava2 PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    add_library(llava2_shared SHARED $<TARGET_OBJECTS:llava2>)
-    target_link_libraries(llava2_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
-    install(TARGETS llava2_shared LIBRARY)
+    set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
+    target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS mtmd_shared LIBRARY)
 endif()
 
 if (NOT MSVC)
     target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
-    target_compile_options(llava2 PRIVATE -Wno-cast-qual) # stb_image.h
+    target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
 
 if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
-    add_dependencies(llava2 BUILD_INFO)
+    add_dependencies(mtmd BUILD_INFO)
 endif()
 
 set(TARGET llama-llava-cli)
@@ -86,7 +86,7 @@ set(TARGET llama-gemma3-cli)
 add_executable(${TARGET} gemma3-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava2 ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-llava-clip-quantize-cli)
diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index d923b04add7aa..4c03529874924 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -329,7 +329,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 }
 
 //
-// API used internally with llava2
+// API used internally with mtmd
 //
 
 projector_type clip_get_projector_type(const struct clip_ctx * ctx);
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index f8ab6a89e6a38..710309edaecd6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2886,7 +2886,7 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
 }
 
 //
-// API used internally with llava2
+// API used internally with mtmd
 //
 
 projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 423efc58024e0..9b643fcd36d72 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -6,7 +6,7 @@
 #include "ggml.h"
 #include "console.h"
 #include "chat.h"
-#include "llava2.h"
+#include "mtmd.h"
 
 #include <vector>
 #include <limits.h>
@@ -57,7 +57,7 @@ static void sigint_handler(int signo) {
 #endif
 
 struct gemma3_context {
-    llava2_context_ptr ctx_vision;
+    mtmd_context_ptr ctx_vision;
     common_init_result llama_init;
 
     llama_model * model;
@@ -86,7 +86,7 @@ struct gemma3_context {
 
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
+        ctx_vision = mtmd_init_from_file(clip_path, model, mtmd_context_params{
             /* use_gpu   */ true,
             /* timings   */ true,
             /* n_threads */ params.cpuparams.n_threads,
@@ -162,7 +162,7 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
 }
 
 static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
-    std::vector<llava2_bitmap> bitmaps;
+    std::vector<mtmd_bitmap> bitmaps;
 
     common_chat_templates_inputs tmpl_inputs;
     tmpl_inputs.messages = {msg};
@@ -172,30 +172,30 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
 
     for (auto & fname : images_fname) {
-        llava2_bitmap bitmap;
-        if (llava2_bitmap_init_from_file(fname.c_str(), bitmap)) {
+        mtmd_bitmap bitmap;
+        if (mtmd_bitmap_init_from_file(fname.c_str(), bitmap)) {
             LOG_ERR("Unable to load image %s\n", fname.c_str());
             return 2; // image not found
         }
         bitmaps.push_back(std::move(bitmap));
     }
 
-    std::vector<llava2_input_chunk> chunks;
-    llava2_input_text text;
+    std::vector<mtmd_input_chunk> chunks;
+    mtmd_input_text text;
     text.text          = formatted_chat.prompt;
     text.add_special   = add_bos;
     text.parse_special = true;
-    if (llava2_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
+    if (mtmd_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
         LOG_ERR("Unable to tokenize prompt\n");
         return 1;
     }
 
-    if (llava2_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
+    if (mtmd_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
         LOG_ERR("Unable to eval prompt\n");
         return 1;
     }
 
-    ctx.n_past += llava2_helper_get_n_tokens(chunks);
+    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
 
     return 0;
 }
diff --git a/examples/llava/llava2.cpp b/examples/llava/mtmd.cpp
similarity index 85%
rename from examples/llava/llava2.cpp
rename to examples/llava/mtmd.cpp
index 1bc153af0edad..544f53d7ba90d 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/mtmd.cpp
@@ -1,6 +1,6 @@
 #include "clip.h"
 #include "clip-impl.h"
-#include "llava2.h"
+#include "mtmd.h"
 
 #include "llama.h"
 
@@ -12,7 +12,7 @@
 #include <limits>
 #include <vector>
 
-struct llava2_context {
+struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
@@ -22,9 +22,9 @@ struct llava2_context {
 
     // TODO @ngxson : add timings
 
-    llava2_context(const char * mmproj_fname,
+    mtmd_context(const char * mmproj_fname,
                  const struct llama_model * text_model,
-                 const struct llava2_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+                 const struct mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -35,20 +35,20 @@ struct llava2_context {
         this->text_model = text_model;
     }
 
-    ~llava2_context() {
+    ~mtmd_context() {
         clip_free(ctx_clip);
     }
 };
 
-struct llava2_image_tokens_data {
+struct mtmd_image_tokens_data {
     clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
 };
 
-llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
+mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
         const struct llama_model * text_model,
-        const struct llava2_context_params ctx_params) {
+        const struct mtmd_context_params ctx_params) {
     try {
-        auto ctx = std::make_shared<llava2_context>(mmproj_fname, text_model, ctx_params);
+        auto ctx = std::make_shared<mtmd_context>(mmproj_fname, text_model, ctx_params);
         return ctx;
     } catch (const std::exception & e) {
         LOG_ERR("%s: error: %s\n", __func__, e.what());
@@ -56,7 +56,7 @@ llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
     }
 }
 
-int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
+int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
     clip_image_u8_ptr img_u8(clip_image_u8_init());
     bool ok = clip_image_load_from_file(fname, img_u8.get());
     if (!ok) {
@@ -70,7 +70,7 @@ int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
 }
 
 // copied from common_tokenize
-static std::vector<llama_token> llava2_tokenize_text_internal(
+static std::vector<llama_token> mtmd_tokenize_text_internal(
     const struct llama_vocab * vocab,
            const std::string & text,
                         bool   add_special,
@@ -89,10 +89,10 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
     return result;
 }
 
-int32_t llava2_tokenize(llava2_context_ptr & ctx,
-        std::vector<llava2_input_chunk> & output,
-        const llava2_input_text & text,
-        const std::vector<llava2_bitmap> & bitmaps) {
+int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
+        std::vector<mtmd_input_chunk> & output,
+        const mtmd_input_text & text,
+        const std::vector<mtmd_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -115,7 +115,7 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
     for (const auto & part : parts) {
         //printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
-        auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
+        auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
         if (tokens.empty()) {
             continue;
         }
         output.push_back({
             LLAVA2_INPUT_CHUNK_TYPE_TEXT,
             std::move(tokens),
             {},
         });
@@ -148,12 +148,12 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
         }
 
-        llava2_image_tokens image_tokens;
+        mtmd_image_tokens image_tokens;
         image_tokens.nx = 0; // TODO
         image_tokens.ny = 0; // TODO
         image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
-        image_tokens.data = std::unique_ptr<llava2_image_tokens_data>(
-            new llava2_image_tokens_data{
+        image_tokens.data = std::unique_ptr<mtmd_image_tokens_data>(
+            new mtmd_image_tokens_data{
                 std::move(batch_f32),
             }
         );
@@ -170,8 +170,8 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
     return 0;
 }
 
-LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
-        const llava2_image_tokens & image_tokens) {
+LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
+        const mtmd_image_tokens & image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
     bool ok = clip_image_batch_encode(
         ctx->ctx_clip,
         ctx->n_threads,
         image_tokens.data->batch_f32.get(),
         ctx->image_embd_v.data());
     return ok ? 0 : 1;
 }
 
-LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) {
+LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t llava2_helper_get_n_tokens(std::vector<llava2_input_chunk> & chunks) {
+size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks) {
     size_t n_tokens = 0;
     for (auto & chunk : chunks) {
         if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
@@ -235,9 +235,9 @@ struct decode_embd_batch {
     }
 };
 
-int32_t llava2_helper_eval(llava2_context_ptr & ctx,
+int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
         llama_context * lctx,
-        std::vector<llava2_input_chunk> & chunks,
+        std::vector<mtmd_input_chunk> & chunks,
         llama_pos pos0,
         llama_seq_id seq_id,
         int32_t n_batch) {
@@ -274,7 +274,7 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
             if (ctx->print_timings) {
                 LOG_INF("encoding image...\n");
             }
-            ret = llava2_encode(ctx, chunk.tokens_image);
+            ret = mtmd_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
@@ -285,7 +285,7 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
             }
 
             int32_t n_tokens = chunk.tokens_image.n_tokens;
-            float * embd = llava2_get_output_embd(ctx);
+            float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
             ret = llama_decode(lctx, batch_img.batch);
diff --git a/examples/llava/llava2.h b/examples/llava/mtmd.h
similarity index 60%
rename from examples/llava/llava2.h
rename to examples/llava/mtmd.h
index 06126a8f12680..38e29246f6de2 100644
--- a/examples/llava/llava2.h
+++ b/examples/llava/mtmd.h
@@ -25,40 +25,40 @@
 #ifdef __cplusplus
 
-enum llava2_input_chunk_type {
+enum mtmd_input_chunk_type {
     LLAVA2_INPUT_CHUNK_TYPE_TEXT,
     LLAVA2_INPUT_CHUNK_TYPE_IMAGE,
 };
 
-struct llava2_context;
-struct llava2_image_tokens_data; // internal data
+struct mtmd_context;
+struct mtmd_image_tokens_data; // internal data
 
-using llava2_context_ptr           = std::shared_ptr<llava2_context>;
-using llava2_image_tokens_data_ptr = std::shared_ptr<llava2_image_tokens_data>;
+using mtmd_context_ptr           = std::shared_ptr<mtmd_context>;
+using mtmd_image_tokens_data_ptr = std::shared_ptr<mtmd_image_tokens_data>;
 
 // represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3 -struct llava2_bitmap { +struct mtmd_bitmap { uint32_t nx; uint32_t ny; std::vector data; }; // represents the processed image as tokens (to be encoded) -struct llava2_image_tokens { +struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction uint32_t n_tokens; // == nx * ny - llava2_image_tokens_data_ptr data; // internal data + mtmd_image_tokens_data_ptr data; // internal data }; -struct llava2_input_chunk { - llava2_input_chunk_type type; +struct mtmd_input_chunk { + mtmd_input_chunk_type type; std::vector tokens_text; - llava2_image_tokens tokens_image; + mtmd_image_tokens tokens_image; }; -struct llava2_context_params { +struct mtmd_context_params { bool use_gpu = true; bool print_timings = true; int n_threads = 4; @@ -66,22 +66,22 @@ struct llava2_context_params { const char * image_marker = "<__image__>"; }; -struct llava2_input_text { +struct mtmd_input_text { std::string text; bool add_special; bool parse_special; }; -// initialize the llava2 context +// initialize the mtmd context // return nullptr on failure -LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname, +LLAVA2_API mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname, const llama_model * text_model, - const llava2_context_params ctx_params); + const mtmd_context_params ctx_params); // helper function to load an image from a file // returns 0 on success // this function is thread-safe -LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output); +LLAVA2_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output); // tokenize an input text prompt and an image // the prompt must have the input image marker (default: "<__image__>") in it @@ -94,29 +94,29 @@ LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitma // 3. "\ndescribe it in detail." // number of bitmaps must be equal to the number of image markers in the prompt // this function is thread-safe (shared ctx) -LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx, - std::vector & output, - const llava2_input_text & text, - const std::vector & bitmaps); +LLAVA2_API int32_t mtmd_tokenize(mtmd_context_ptr & ctx, + std::vector & output, + const mtmd_input_text & text, + const std::vector & bitmaps); // returns 0 on success -LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx, - const llava2_image_tokens & image_tokens); +LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx, + const mtmd_image_tokens & image_tokens); // get output embeddings from the last encode pass -LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx); +LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx); // simple helper to count the total number of tokens from a list of chunks, useful to keep track of n_past -LLAVA2_API size_t llava2_helper_get_n_tokens(std::vector & chunks); +LLAVA2_API size_t mtmd_helper_get_n_tokens(std::vector & chunks); // helper function that automatically: // 1. run llama_decode() on text chunks -// 2. run llava2_encode() on image chunks, then llava2_get_output_embd() and then llama_decode() -// if any of the llava2_encode() or llama_decode() calls return non-zero, stop and forward the error +// 2. 
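To make the API shape after this rename concrete, here is a minimal caller sketch against the declarations above (C++; `model`, `lctx` and `n_batch` are assumed to come from the usual llama.cpp setup, and "mmproj.gguf"/"image.jpg" are placeholder paths):

    // sketch: shared_ptr-based context, tokenize fills an output parameter
    mtmd_context_ptr ctx = mtmd_init_from_file("mmproj.gguf", model, mtmd_context_params{});
    if (!ctx) { /* init failed */ }

    mtmd_bitmap bitmap;
    if (mtmd_bitmap_init_from_file("image.jpg", bitmap) != 0) { /* load failed */ }

    mtmd_input_text text;
    text.text          = "describe this image: <__image__>";
    text.add_special   = true;
    text.parse_special = true;

    std::vector<mtmd_input_chunk> chunks;
    if (mtmd_tokenize(ctx, chunks, text, {bitmap}) != 0) { /* tokenize failed */ }

    // decodes text chunks and encode+decodes image chunks, in order
    if (mtmd_helper_eval(ctx, lctx, chunks, /*pos0=*/0, /*seq_id=*/0, n_batch) != 0) { /* eval failed */ }

Note that at this point the caller owns the chunk vector directly while the context is shared; the next patch in the series moves both to explicit create/free pairs.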
From 430dbd85e269122858312f5ac23c3f7ec8b319be Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 10 Apr 2025 15:47:43 +0200
Subject: [PATCH 09/11] improve api

---
 examples/llava/CMakeLists.txt |   4 +-
 examples/llava/gemma3-cli.cpp |  12 ++--
 examples/llava/mtmd.cpp       | 110 ++++++++++++++++++++--------
 examples/llava/mtmd.h         |  71 ++++++++++++----------
 4 files changed, 111 insertions(+), 86 deletions(-)

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index c76555c59f0fd..2d5061de460c0 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -37,8 +37,8 @@ add_library(mtmd OBJECT
 target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
 
 target_include_directories(mtmd PUBLIC .)
-target_include_directories(mtmd PUBLIC ../..)
-target_include_directories(mtmd PUBLIC ../../common) # for stb_image.h
+target_include_directories(mtmd PRIVATE ../..)
+target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
 
 target_compile_features(mtmd PRIVATE cxx_std_17)
 
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 9b643fcd36d72..26d18921a5c75 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -86,12 +86,12 @@ struct gemma3_context {
 
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_vision = mtmd_init_from_file(clip_path, model, mtmd_context_params{
+        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
            /* use_gpu */   true,
            /* timings */   true,
            /* n_threads */ params.cpuparams.n_threads,
            /* verbosity */ GGML_LOG_LEVEL_INFO,
-        });
+        }));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
             exit(1);
@@ -180,22 +180,22 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
         bitmaps.push_back(std::move(bitmap));
     }
 
-    std::vector<mtmd_input_chunk> chunks;
     mtmd_input_text text;
     text.text = formatted_chat.prompt;
     text.add_special = add_bos;
     text.parse_special = true;
-    if (mtmd_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
+    mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.ctx_vision.get(), text, bitmaps));
+    if (chunks == nullptr) {
         LOG_ERR("Unable to tokenize prompt\n");
         return 1;
     }
 
-    if (mtmd_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
+    if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks.get(), ctx.n_past, 0, ctx.n_batch)) {
         LOG_ERR("Unable to eval prompt\n");
         return 1;
     }
 
-    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
+    ctx.n_past += mtmd_helper_get_n_tokens(chunks.get());
 
     return 0;
 }
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 544f53d7ba90d..68baee9dddc8d 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -44,18 +44,30 @@ struct mtmd_image_tokens_data {
     clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
 };
 
-mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
+struct mtmd_image_tokens {
+    uint32_t nx; // number of tokens in x direction
+    uint32_t ny; // number of tokens in y direction
+    uint32_t n_tokens() const { return nx * ny; }
+    clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
+};
+
+mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
         const struct llama_model * text_model,
         const struct mtmd_context_params ctx_params) {
     try {
-        auto ctx = std::make_shared<mtmd_context>(mmproj_fname, text_model, ctx_params);
-        return ctx;
+        return new mtmd_context(mmproj_fname, text_model, ctx_params);
     } catch (const std::exception & e) {
         LOG_ERR("%s: error: %s\n", __func__, e.what());
         return nullptr;
     }
 }
 
+void mtmd_free(mtmd_context * ctx) {
+    if (ctx) {
+        delete ctx;
+    }
+}
+
 int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
     clip_image_u8_ptr img_u8(clip_image_u8_init());
     bool ok = clip_image_load_from_file(fname, img_u8.get());
     if (!ok) {
@@ -89,10 +101,10 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
     return result;
 }
 
-int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
-        std::vector<mtmd_input_chunk> & output,
-        const mtmd_input_text & text,
-        const std::vector<mtmd_bitmap> & bitmaps) {
+mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
+        const mtmd_input_text & text,
+        const std::vector<mtmd_bitmap> & bitmaps) {
+    mtmd_input_chunks * output = new mtmd_input_chunks;
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -107,8 +119,8 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
     }
 
     std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
-    output.clear();
-    output.reserve(parts.size());
+    output->clear();
+    output->reserve(parts.size());
 
     size_t i_img = 0;
 
@@ -119,18 +131,19 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
         if (tokens.empty()) {
             continue;
         }
-        output.push_back({
-            LLAVA2_INPUT_CHUNK_TYPE_TEXT,
+        mtmd_input_chunk chunk{
+            MTMD_INPUT_CHUNK_TYPE_TEXT,
             std::move(tokens),
             {},
-        });
+        };
+        output->emplace_back(std::move(chunk));
 
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
 
             if (i_img >= bitmaps.size()) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return 2;
+                return nullptr;
             }
 
             // shim layer
@@ -145,54 +158,58 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
             bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get());
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
-                return 1;
+                return nullptr;
             }
 
-            mtmd_image_tokens image_tokens;
-            image_tokens.nx = 0; // TODO
-            image_tokens.ny = 0; // TODO
-            image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
-            image_tokens.data = std::unique_ptr<mtmd_image_tokens_data>(
-                new mtmd_image_tokens_data{
-                    std::move(batch_f32),
-                }
-            );
-
-            output.push_back({
-                LLAVA2_INPUT_CHUNK_TYPE_IMAGE,
+            mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
+            image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
+            image_tokens->ny = 1; // TODO
+            image_tokens->batch_f32 = std::move(batch_f32);
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
-                std::move(image_tokens),
-            });
+                image_tokens,
+            };
+            output->emplace_back(std::move(chunk));
             i_img++;
         }
     }
 
-    return 0;
+    return output;
+}
+
+void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
+    for (auto & chunk : *chunks) {
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
+            delete chunk.tokens_image;
+        }
+    }
+    delete chunks;
 }
 
-LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
-        const mtmd_image_tokens & image_tokens) {
+int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
-    ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
+    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = clip_image_batch_encode(
         ctx->ctx_clip,
         ctx->n_threads,
-        image_tokens.data->batch_f32.get(),
+        image_tokens->batch_f32.get(),
         ctx->image_embd_v.data());
     return ok ? 0 : 1;
 }
 
-LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx) {
+float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks) {
+size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
     size_t n_tokens = 0;
-    for (auto & chunk : chunks) {
-        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
+    for (auto & chunk : *chunks) {
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             n_tokens += chunk.tokens_text.size();
-        } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
-            n_tokens += chunk.tokens_image.n_tokens;
+        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            n_tokens += chunk.tokens_image->n_tokens();
        } else {
            GGML_ASSERT(false && "chunk type not supported");
        }
@@ -235,9 +252,9 @@ struct decode_embd_batch {
     }
 };
 
-int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
+int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_context * lctx,
-        std::vector<mtmd_input_chunk> & chunks,
+        mtmd_input_chunks * chunks,
         llama_pos pos0,
         llama_seq_id seq_id,
         int32_t n_batch) {
@@ -245,9 +262,9 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
     llama_pos n_past = pos0;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
 
-    for (auto & chunk : chunks) {
-        bool is_last = &chunk == &chunks.back();
-        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
+    for (auto & chunk : *chunks) {
+        bool is_last = &chunk == &chunks->back();
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             // TODO @ngxson : may need to split into smaller batches
             text_batch.n_tokens = chunk.tokens_text.size();
             for (size_t i = 0; i < chunk.tokens_text.size(); i++) {
@@ -268,8 +285,9 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
                 return ret;
             }
 
-        } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
+        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
+            GGML_ASSERT(chunk.tokens_image != nullptr);
             int64_t t0 = ggml_time_ms();
             if (ctx->print_timings) {
                 LOG_INF("encoding image...\n");
@@ -284,7 +302,7 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
                 LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
-            int32_t n_tokens = chunk.tokens_image.n_tokens;
+            int32_t n_tokens = chunk.tokens_image->n_tokens();
             float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index 38e29246f6de2..5222bb73127ec 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -1,5 +1,5 @@
-#ifndef LLAVA2_H
-#define LLAVA2_H
+#ifndef MTMD_H
+#define MTMD_H
 
 #include "ggml.h"
 #include "llama.h"
@@ -12,29 +12,26 @@
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
-#            define LLAVA2_API __declspec(dllexport)
+#            define MTMD_API __declspec(dllexport)
 #        else
-#            define LLAVA2_API __declspec(dllimport)
+#            define MTMD_API __declspec(dllimport)
 #        endif
 #    else
-#        define LLAVA2_API __attribute__ ((visibility ("default")))
+#        define MTMD_API __attribute__ ((visibility ("default")))
 #    endif
 #else
-#    define LLAVA2_API
+#    define MTMD_API
 #endif
 
 #ifdef __cplusplus
 
 enum mtmd_input_chunk_type {
-    LLAVA2_INPUT_CHUNK_TYPE_TEXT,
-    LLAVA2_INPUT_CHUNK_TYPE_IMAGE,
+    MTMD_INPUT_CHUNK_TYPE_TEXT,
+    MTMD_INPUT_CHUNK_TYPE_IMAGE,
 };
 
 struct mtmd_context;
-struct mtmd_image_tokens_data; // internal data
-
-using mtmd_context_ptr = std::shared_ptr<mtmd_context>;
-using mtmd_image_tokens_data_ptr = std::shared_ptr<mtmd_image_tokens_data>;
+struct mtmd_image_tokens;
 
 // represents raw image data, layout is RGBRGBRGB...
 // length of data must be nx * ny * 3
@@ -44,20 +41,14 @@ struct mtmd_bitmap {
     std::vector<unsigned char> data;
 };
 
-// represents the processed image as tokens (to be encoded)
-struct mtmd_image_tokens {
-    uint32_t nx; // number of tokens in x direction
-    uint32_t ny; // number of tokens in y direction
-    uint32_t n_tokens; // == nx * ny
-    mtmd_image_tokens_data_ptr data; // internal data
-};
-
 struct mtmd_input_chunk {
     mtmd_input_chunk_type type;
-    std::vector<llama_token> tokens_text;
-    mtmd_image_tokens tokens_image;
+    std::vector<llama_token> tokens_text;
+    mtmd_image_tokens * tokens_image = nullptr;
 };
 
+using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
+
 struct mtmd_context_params {
     bool use_gpu = true;
     bool print_timings = true;
@@ -74,14 +65,16 @@ struct mtmd_input_text {
 
 // initialize the mtmd context
 // return nullptr on failure
-LLAVA2_API mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
+MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
         const llama_model * text_model,
         const mtmd_context_params ctx_params);
 
+MTMD_API void mtmd_free(mtmd_context * ctx);
+
 // helper function to load an image from a file
 // returns 0 on success
 // this function is thread-safe
-LLAVA2_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
+MTMD_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
 
 // tokenize an input text prompt and an image
 // the prompt must have the input image marker (default: "<__image__>") in it
@@ -94,29 +87,47 @@ LLAVA2_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap &
 // 3. "\ndescribe it in detail."
 // number of bitmaps must be equal to the number of image markers in the prompt
 // this function is thread-safe (shared ctx)
-LLAVA2_API int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
-        std::vector<mtmd_input_chunk> & output,
+MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
         const mtmd_input_text & text,
         const std::vector<mtmd_bitmap> & bitmaps);
 
+// free image chunk data
+MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+
 // returns 0 on success
-LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
-        const mtmd_image_tokens & image_tokens);
+MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
+        const mtmd_image_tokens * image_tokens);
 
 // get output embeddings from the last encode pass
-LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx);
+MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
 // simple helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
-LLAVA2_API size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks);
+MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
 
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
 // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
 // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
 // otherwise, returns 0 on success
-LLAVA2_API int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
+MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_context * lctx,
-        std::vector<mtmd_input_chunk> & chunks,
+        mtmd_input_chunks * chunks,
         llama_pos pos0,
         llama_seq_id seq_id,
         int32_t n_batch);
+
+// convenient unique_ptr wrappers
+struct mtmd_context_deleter {
+    void operator()(mtmd_context * val) { mtmd_free(val); }
+};
+using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
+struct mtmd_input_chunks_deleter {
+    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
+};
+using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
 #else
 
 static_assert(false && "C header is not yet supported by this library");
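The net effect of the API change above is a C-style ownership model with RAII available on top; a caller sketch under the same assumptions as the earlier one (`model`, `lctx`, `n_batch`, and a filled `text`/`bitmaps` pair):

    // sketch: raw pointers + explicit free functions, wrapped in unique_ptr
    mtmd_context_ptr ctx(mtmd_init_from_file("mmproj.gguf", model, mtmd_context_params{}));
    if (!ctx) { /* init failed */ }

    mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.get(), text, bitmaps));
    if (!chunks) { /* tokenize failed */ }

    if (mtmd_helper_eval(ctx.get(), lctx, chunks.get(), /*pos0=*/0, /*seq_id=*/0, n_batch) != 0) {
        /* eval failed */
    }
    // mtmd_input_chunks_free() and mtmd_free() run automatically via the deleters

Returning heap-allocated chunks also lets mtmd_image_tokens become fully opaque, which is what allows this patch to move nx/ny/batch_f32 out of the public header.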
From 6ed09b70dcca12177317e6c9f63e4a30cfc5b1ee Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 10 Apr 2025 18:20:28 +0200
Subject: [PATCH 10/11] refine helpers

---
 examples/llava/gemma3-cli.cpp |  2 +-
 examples/llava/mtmd.cpp       | 39 +++++++++++++++++++++++------------
 examples/llava/mtmd.h         | 21 +++++++++++++------
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 26d18921a5c75..91a07e2a8f40d 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -173,7 +173,7 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
 
     for (auto & fname : images_fname) {
         mtmd_bitmap bitmap;
-        if (mtmd_bitmap_init_from_file(fname.c_str(), bitmap)) {
+        if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
             LOG_ERR("Unable to load image %s\n", fname.c_str());
             return 2; // image not found
         }
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 68baee9dddc8d..743dd6266b169 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -68,19 +68,6 @@ void mtmd_free(mtmd_context * ctx) {
     }
 }
 
-int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_file(fname, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname);
-        return 1;
-    }
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
-    output.data.resize(output.nx * output.ny * 3);
-    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
-    return 0;
-}
-
 // copied from common_tokenize
 static std::vector<llama_token> mtmd_tokenize_text_internal(
     const struct llama_vocab * vocab,
@@ -326,3 +313,29 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
     llama_batch_free(text_batch);
     return 0;
 }
+
+int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image from buffer\n");
+        return 1;
+    }
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize(output.nx * output.ny * 3);
+    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
+    return 0;
+}
+
+int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_file(fname, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image %s\n", fname);
+        return 1;
+    }
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize(output.nx * output.ny * 3);
+    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
+    return 0;
+}
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index 5222bb73127ec..598f6947bb092 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -71,11 +71,6 @@ MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
 
 MTMD_API void mtmd_free(mtmd_context * ctx);
 
-// helper function to load an image from a file
-// returns 0 on success
-// this function is thread-safe
-MTMD_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
-
 // tokenize an input text prompt and an image
 // the prompt must have the input image marker (default: "<__image__>") in it
 // the marker will be replaced with the image tokens
@@ -101,7 +96,11 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
-// simple helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
+//
+// helper functions (can be implemented based on other functions)
+//
+
+// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
 MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
 
 // helper function that automatically:
@@ -116,6 +115,16 @@ MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_seq_id seq_id,
         int32_t n_batch);
 
+// helper function to construct a mtmd_bitmap from a file
+// returns 0 on success
+// this function is thread-safe
+MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
+
+// helper function to construct a mtmd_bitmap from a buffer
+// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
+// returns 0 on success
+// this function is thread-safe
+MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
 
 // convenient unique_ptr wrappers
 struct mtmd_context_deleter {
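With the helpers split out, loading from a memory buffer and from a file go through the same decode path; a sketch of the buffer variant (`read_file_to_bytes` is a hypothetical stand-in for however the encoded jpg/png bytes are obtained):

    std::vector<unsigned char> bytes = read_file_to_bytes("image.png"); // hypothetical loader
    mtmd_bitmap bitmap;
    if (mtmd_helper_bitmap_init_from_buf(bytes.data(), bytes.size(), bitmap) != 0) {
        /* decode failed */
    }
    // on success, bitmap.data holds bitmap.nx * bitmap.ny * 3 bytes of RGB pixels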
From aed3216b50d6ba149aa8bcb0f4c70bdae45a0be3 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Thu, 10 Apr 2025 19:41:54 +0200
Subject: [PATCH 11/11] Update examples/llava/mtmd.cpp

Co-authored-by: Georgi Gerganov
---
 examples/llava/mtmd.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 743dd6266b169..58503d0b22c33 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -23,8 +23,8 @@ struct mtmd_context {
 
     // TODO @ngxson : add timings
     mtmd_context(const char * mmproj_fname,
-            const struct llama_model * text_model,
-            const struct mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+            const llama_model * text_model,
+            const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
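Across all of these revisions the position bookkeeping stays with the caller, which is what mtmd_helper_get_n_tokens is for; a closing sketch of the loop a CLI like gemma3-cli ends up with (same assumed setup as the earlier sketches):

    llama_pos n_past = 0;
    if (mtmd_helper_eval(ctx.get(), lctx, chunks.get(), n_past, /*seq_id=*/0, n_batch) != 0) {
        /* eval failed */
    }
    n_past += mtmd_helper_get_n_tokens(chunks.get());
    // generation then continues from n_past: sample a token, llama_decode() it, n_past++, repeat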