From 235340d3ef0359c3d275047148cc894e9392e56c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 09:32:01 +0200
Subject: [PATCH 01/11] wip llava2

---
 examples/llava/clip-impl.h    |  55 +++++++++++
 examples/llava/clip.cpp       |  23 ++---
 examples/llava/clip.h         |   3 +
 examples/llava/gemma3-cli.cpp |  20 ++--
 examples/llava/llava2.cpp     | 174 ++++++++++++++++++++++++++++++++++
 examples/llava/llava2.h       | 102 ++++++++++++++++++++
 6 files changed, 350 insertions(+), 27 deletions(-)
 create mode 100644 examples/llava/llava2.cpp
 create mode 100644 examples/llava/llava2.h

diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index 685d6e7e09ad1..e9c23a59db7ee 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -1,6 +1,8 @@
 #include "ggml.h"
 #include "gguf.h"
 
+#include "clip.h"
+
 #include <climits>
 #include <cstdarg>
 #include <string>
@@ -120,6 +122,23 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
     return PROJECTOR_TYPE_UNKNOWN;
 }
 
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 //
 // logging
 //
@@ -178,6 +197,28 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ...
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  __VA_ARGS__)
 
+//
+// cpp wrappers
+//
+
+struct clip_image_u8_deleter {
+    void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
+};
+
+struct clip_image_f32_deleter {
+    void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
+};
+
+struct clip_image_f32_batch_deleter {
+    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
+};
+
+typedef std::unique_ptr<clip_image_u8,  clip_image_u8_deleter>  clip_image_u8_ptr;
+typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
+typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
+
+// TODO @ngxson : we're currently having a naming clash between struct clip_image_size and function clip_image_size()
+
 //
 // common utils
 //
@@ -214,6 +255,20 @@ static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
     s = std::move(builder);
 }
 
+// split string by a `std::string delim` instead of `char delim`
+static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    size_t pos = 0;
+    std::string token;
+    while ((pos = s.find(delimiter)) != std::string::npos) {
+        token = s.substr(0, pos);
+        tokens.push_back(token);
+        s.erase(0, pos + delimiter.length());
+    }
+    tokens.push_back(s);
+    return tokens;
+}
+
 //
 // gguf utils
 //
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e9520f3d1a378..f29f1c7aab7ce 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -32,23 +32,6 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};
 
 //#define CLIP_DEBUG_FUNCTIONS
 
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 #ifdef CLIP_DEBUG_FUNCTIONS
 static void clip_image_write_image_to_ppm(const clip_image_u8 & img, const std::string & filename) {
     std::ofstream file(filename, std::ios::binary);
@@ -1618,6 +1601,12 @@ struct clip_image_f32 * clip_image_f32_init() {
     return new clip_image_f32();
 }
 
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
+    if (nx) *nx = img->nx;
+    if (ny) *ny = img->ny;
+    return img->buf.data();
+}
+
 void clip_image_u8_free(struct clip_image_u8 * img)   { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index d806465bf68bb..9f6cc83e0280a 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -77,6 +77,9 @@ CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
+// nx, ny are the output image dimensions
+CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 4f89c0e15b4e9..b5e8a305ce05e 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -2,11 +2,10 @@
 #include "log.h"
 #include "common.h"
 #include "sampling.h"
-#include "clip.h"
-#include "stb_image.h"
 #include "llama.h"
 #include "ggml.h"
 #include "console.h"
+#include "llava2.h"
 
 #include <vector>
 #include <limits.h>
@@ -57,8 +56,8 @@ static void sigint_handler(int signo) {
 #endif
 
 struct gemma3_context {
-    struct clip_ctx * ctx_clip = NULL;
-    common_init_result llama_init;
+    llava2_context_ptr ctx_llava2;
+    common_init_result  llama_init;
 
     llama_model * model;
     llama_context * lctx;
@@ -79,16 +78,16 @@ struct gemma3_context {
 
     void init_clip_model(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
-        if (!ctx_clip) {
+        ctx_llava2 = llava2_init_from_file(clip_path, model, llava2_context_params{
+            /* use_gpu   */ true,
+            /* n_threads */ params.cpuparams.n_threads,
+            /* verbosity */ GGML_LOG_LEVEL_INFO,
+        });
+        if (!ctx_llava2.get()) {
             LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
             exit(1);
         }
     }
-
-    ~gemma3_context() {
-        clip_free(ctx_clip);
-    }
 };
 
 struct decode_embd_batch {
@@ -271,6 +270,7 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
+        std::string prompt = "<start_of_turn>user\n" + params.prompt + "<end_of_turn><start_of_turn>model\n";
         if (eval_text(ctx, "<start_of_turn>user\n")) {
             return 1;
         }
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
new file mode 100644
index 0000000000000..a80ffe5fdf386
--- /dev/null
+++ b/examples/llava/llava2.cpp
@@ -0,0 +1,174 @@
+#include "clip.h"
+#include "clip-impl.h"
+#include "llava2.h"
+
+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <vector>
+
+static const char * IMG_MARKER = "<image>";
+
+struct llava2_context {
+    struct clip_ctx * ctx_clip;
+    const struct llama_model * text_model;
+    std::vector<float> image_embd_v; // image embedding vector
+    int n_threads;
+
+    llava2_context(const char * mmproj_fname,
+                   const struct llama_model * text_model,
+                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads) {
+        clip_context_params ctx_clip_params;
+        ctx_clip_params.use_gpu   = ctx_params.use_gpu;
+        ctx_clip_params.verbosity = ctx_params.verbosity;
+        ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
+        if (!ctx_clip) {
+            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
+        }
+        this->text_model = text_model;
+    }
+
+    ~llava2_context() {
+        clip_free(ctx_clip);
+    }
+};
+
+struct llava2_image_tokens_data {
+    clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
+};
+
+llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
+        const struct llama_model * text_model,
+        const struct llava2_context_params ctx_params) {
+    try {
+        auto ctx = std::make_shared<llava2_context>(mmproj_fname, text_model, ctx_params);
+        return ctx;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return nullptr;
+    }
+}
+
+int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_file(fname, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image %s\n", fname);
+        return 1;
+    }
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize(output.nx * output.ny * 3);
+    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
+    return 0;
+}
+
+// copied from common_tokenize
+static std::vector<llama_token> llava2_tokenize_text_internal(
+    const struct llama_vocab * vocab,
+           const std::string & text,
+                        bool   add_special,
+                        bool   parse_special) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + 2 * add_special;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+int32_t llava2_tokenize(llava2_context_ptr & ctx,
+        std::vector<llava2_input_chunk> & output,
+        const std::string & prompt,
+        bool add_special,
+        bool parse_special,
+        const std::vector<llava2_bitmap> & bitmaps) {
+    auto vocab = llama_model_get_vocab(ctx->text_model);
+
+    std::vector<std::string> parts = string_split_str(prompt, IMG_MARKER);
+    output.clear();
+    output.reserve(parts.size());
+
+    size_t i_img = 0;
+
+    for (const auto & part : parts) {
+        //printf("tokenizing part: %s\n", part.c_str());
+        bool add_bos = &parts.front() == &part;
+        auto tokens = llava2_tokenize_text_internal(vocab, part, add_special && add_bos, parse_special);
+        if (tokens.empty()) {
+            continue;
+        }
+        output.push_back({
+            LLAVA2_INPUT_CHUNK_TYPE_TEXT,
+            std::move(tokens),
+            {},
+        });
+
+        if (&parts.back() != &part) {
+            // add image token to middle of 2 parts
+
+            if (i_img >= bitmaps.size()) {
+                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
+                return 2;
+            }
+
+            // shim layer
+            clip_image_u8_ptr img_u8(clip_image_u8_init());
+            img_u8->nx = bitmaps[i_img].nx;
+            img_u8->ny = bitmaps[i_img].ny;
+            img_u8->buf.resize(bitmaps[i_img].data.size());
+            std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
+
+            // preprocess image
+            clip_image_f32_batch_ptr batch_f32;
+            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get());
+            if (!ok) {
LOG_ERR("Unable to preprocess image\n"); + return 1; + } + + llava2_image_tokens image_tokens; + //image_tokens.nx = ...; + //image_tokens.ny = ...; + image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image + image_tokens.data = std::unique_ptr( + new llava2_image_tokens_data{ + std::move(batch_f32), + } + ); + + output.push_back({ + LLAVA2_INPUT_CHUNK_TYPE_IMAGE, + {}, + std::move(image_tokens), + }); + i_img++; + } + } + + return 0; +} + +LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx, + const llava2_image_tokens & image_tokens) { + ctx->image_embd_v.reserve(image_tokens.n_tokens * clip_n_mmproj_embd(ctx->ctx_clip)); + return clip_image_batch_encode( + ctx->ctx_clip, + ctx->n_threads, + image_tokens.data->batch_f32.get(), + ctx->image_embd_v.data()); +} + +LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) { + return ctx->image_embd_v.data(); +} diff --git a/examples/llava/llava2.h b/examples/llava/llava2.h new file mode 100644 index 0000000000000..188c18fdc259b --- /dev/null +++ b/examples/llava/llava2.h @@ -0,0 +1,102 @@ +#ifndef LLAVA2_H +#define LLAVA2_H + +#include "ggml.h" +#include "llama.h" +#include "clip.h" + +#include +#include +#include + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define LLAVA2_API __declspec(dllexport) +# else +# define LLAVA2_API __declspec(dllimport) +# endif +# else +# define LLAVA2_API __attribute__ ((visibility ("default"))) +# endif +#else +# define LLAVA2_API +#endif + +#ifdef __cplusplus + +enum llava2_input_chunk_type { + LLAVA2_INPUT_CHUNK_TYPE_TEXT, + LLAVA2_INPUT_CHUNK_TYPE_IMAGE, +}; + +struct llava2_context; +struct llava2_image_tokens_data; // internal data + +using llava2_context_ptr = std::shared_ptr; +using llava2_image_tokens_data_ptr = std::shared_ptr; + +// represents raw image data, layout is RGBRGBRGB... +// length of data must be nx * ny * 3 +struct llava2_bitmap { + uint32_t nx; + uint32_t ny; + std::vector data; +}; + +// represents the processed image as tokens (to be encoded) +struct llava2_image_tokens { + uint32_t nx; // number of tokens in x direction + uint32_t ny; // number of tokens in y direction + uint32_t n_tokens; // == nx * ny + llava2_image_tokens_data_ptr data; // internal data +}; + +struct llava2_input_chunk { + llava2_input_chunk_type type; + std::vector tokens_text; + llava2_image_tokens tokens_image; +}; + +struct llava2_context_params { + bool use_gpu = true; + int n_threads = 4; + enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO; +}; + +LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname, + const llama_model * text_model, + const llava2_context_params ctx_params); + +// helper function to load an image from a file +LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output); + +// tokenize an input text prompt and an image +// the prompt must have the input image marker in it +// the marker will be replaced with the image tokens +// for example: +// "here is an image: \ndescribe it in detail." +// this will gives 3 chunks: +// 1. "here is an image: " +// 2. (image tokens) +// 3. "\ndescribe it in detail." 
+// number of bitmaps must be equal to the number of markers in the prompt
+LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx,
+        std::vector<llava2_input_chunk> & output,
+        const std::string & prompt,
+        bool add_special,
+        bool parse_special,
+        const std::vector<llava2_bitmap> & bitmaps);
+
+LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
+        const llava2_image_tokens & image_tokens);
+
+LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx);
+
+#else
+
+static_assert(false && "C header is not yet supported by this library");
+
+#endif
+
+#endif

From 96bf95ed5fdc762f3bc280eac559874a949dfc15 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 12:11:10 +0200
Subject: [PATCH 02/11] migrated gemma3 to llava2

---
 examples/llava/CMakeLists.txt |  32 +++++-
 examples/llava/clip.cpp       |   2 +
 examples/llava/gemma3-cli.cpp | 180 ++++++++++++++--------------------
 examples/llava/llava2.cpp     | 131 +++++++++++++++++++++++--
 examples/llava/llava2.h      |  29 +++++-
 5 files changed, 257 insertions(+), 117 deletions(-)

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index f275ce1ccd003..491435ec2342e 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -1,3 +1,5 @@
+# llava (legacy)
+
 add_library(llava OBJECT
             llava.cpp
             llava.h
@@ -22,12 +24,40 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS llava_shared LIBRARY)
 endif()
 
+# llava2
+
+add_library(llava2 OBJECT
+            llava2.cpp
+            llava2.h
+            clip.cpp
+            clip.h
+            clip-impl.h
+            )
+
+target_link_libraries(llava2 PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(llava2 PUBLIC .)
+target_include_directories(llava2 PUBLIC ../..)
+
+target_compile_features(llava2 PRIVATE cxx_std_17)
+
+add_library(llava2_static STATIC $<TARGET_OBJECTS:llava2>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(llava2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(llava2 PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(llava2_shared SHARED $<TARGET_OBJECTS:llava2>)
+    target_link_libraries(llava2_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS llava2_shared LIBRARY)
+endif()
+
 if (NOT MSVC)
     target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
+    target_compile_options(llava2 PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
 
 if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
+    add_dependencies(llava2 BUILD_INFO)
 endif()
 
 set(TARGET llama-llava-cli)
@@ -55,7 +85,7 @@ set(TARGET llama-gemma3-cli)
 add_executable(${TARGET} gemma3-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llava2 ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-llava-clip-quantize-cli)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index f29f1c7aab7ce..2072a318cfda6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2330,6 +2330,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
         int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
         int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
         n_patches = x_patch * y_patch;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        n_patches = 256;
     }
 
     return n_patches;
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index b5e8a305ce05e..fd3778f918def 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -5,6 +5,7 @@
 #include "llama.h"
 #include "ggml.h"
 #include "console.h"
+#include "chat.h"
 #include "llava2.h"
 
 #include <vector>
 #include <limits.h>
@@ -56,13 +57,18 @@ static void sigint_handler(int signo) {
 #endif
 
 struct gemma3_context {
-    llava2_context_ptr ctx_llava2;
+    llava2_context_ptr ctx_vision;
     common_init_result llama_init;
 
     llama_model * model;
     llama_context * lctx;
     const llama_vocab * vocab;
     llama_batch batch;
+    int n_batch;
+
+    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
+    // so here we don't need to keep track of chat history
+    common_chat_templates_ptr tmpls;
 
     int n_threads = 1;
     llama_pos n_past = 0;
@@ -73,18 +79,20 @@ struct gemma3_context {
         vocab = llama_model_get_vocab(model);
         n_threads = params.cpuparams.n_threads;
         batch = llama_batch_init(params.n_batch, 0, 1);
-        init_clip_model(params);
+        n_batch = params.n_batch;
+        tmpls = common_chat_templates_init(model, params.chat_template);
+        init_vision_context(params);
     }
 
-    void init_clip_model(common_params & params) {
+    void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_llava2 = llava2_init_from_file(clip_path, model, llava2_context_params{
+        ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
             /* use_gpu   */ true,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         });
-        if (!ctx_llava2.get()) {
-            LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
+        if (!ctx_vision.get()) {
+            LOG_ERR("Failed to load vision model from %s\n", clip_path);
             exit(1);
         }
     }
@@ -123,77 +131,6 @@ struct decode_embd_batch {
     }
 };
 
-static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
-    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
-    common_batch_clear(ctx.batch);
-    for (llama_token & t : tokens) {
-        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
-    }
-    if (logits_last) {
-        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
-    }
-    // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
-    if (llama_decode(ctx.lctx, ctx.batch)) {
-        LOG_ERR("Failed to decode text\n");
-        return 1;
-    }
-    return 0;
-}
-
-static int eval_image(gemma3_context & ctx, std::string & fname) {
-    std::vector<float> image_embd_v;
-    int n_embd = llama_model_n_embd(ctx.model);
-    int n_tokens = 256;
-    image_embd_v.resize(n_tokens * n_embd);
-
-    bool ok;
-    struct clip_image_u8 * img_u8 = clip_image_u8_init();
-    ok = clip_image_load_from_file(fname.c_str(), img_u8);
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname.c_str());
-        clip_image_u8_free(img_u8);
-        return 2; // non-fatal error
-    }
-
-    clip_image_f32_batch batch_f32;
-    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
-    if (!ok) {
-        LOG_ERR("Unable to preprocess image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-
-    int64_t t0 = ggml_time_ms();
-    LOG("Encoding image %s\n", fname.c_str());
-    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
-    if (!ok) {
-        LOG_ERR("Unable to encode image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-
-    clip_image_f32_batch_free(&batch_f32);
-    clip_image_u8_free(img_u8);
-
-    // decode image embeddings
-    int64_t t1 = ggml_time_ms();
-    eval_text(ctx, "<start_of_image>");
-    llama_set_causal_attn(ctx.lctx, false);
-    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
-    if (llama_decode(ctx.lctx, batch_img.batch)) {
-        LOG_ERR("failed to decode image\n");
-        return 1;
-    }
-    ctx.n_past += n_tokens;
-    llama_set_causal_attn(ctx.lctx, true);
-    eval_text(ctx, "<end_of_image>");
-    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
-    return 0;
-}
-
 static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
     for (int i = 0; i < n_predict; i++) {
         if (i > n_predict || !g_is_generating) {
@@ -223,6 +160,41 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
     return 0;
 }
 
+static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
+    std::vector<llava2_bitmap> bitmaps;
+
+    common_chat_templates_inputs tmpl_inputs;
+    tmpl_inputs.messages = {msg};
+    tmpl_inputs.add_generation_prompt = true;
+    tmpl_inputs.use_jinja = false; // jinja is buggy here
+    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+
+    for (auto & fname : images_fname) {
+        llava2_bitmap bitmap;
+        if (llava2_bitmap_init_from_file(fname.c_str(), bitmap)) {
+            LOG_ERR("Unable to load image %s\n", fname.c_str());
+            return 2; // image not found
+        }
+        bitmaps.push_back(std::move(bitmap));
+    }
+
+    std::vector<llava2_input_chunk> chunks;
+    if (llava2_tokenize(ctx.ctx_vision, chunks, formatted_chat.prompt, add_bos, true, bitmaps)) {
+        LOG_ERR("Unable to tokenize prompt\n");
+        return 1;
+    }
+
+    if (llava2_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
+        LOG_ERR("Unable to eval prompt\n");
+        return 1;
+    }
+
+    ctx.n_past += llava2_helper_get_n_tokens(chunks);
+
+    return 0;
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
 
@@ -264,22 +236,15 @@ int main(int argc, char ** argv) {
 #endif
     }
 
-    if (eval_text(ctx, "<bos>")) {
-        return 1;
-    }
-
     if (is_single_turn) {
         g_is_generating = true;
-        std::string prompt = "<start_of_turn>user\n" + params.prompt + "<end_of_turn><start_of_turn>model\n";
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-        for (auto & fname : params.image) {
-            if (eval_image(ctx, fname)) {
-                return 1;
-            }
+        if (params.prompt.find("<__image__>") == std::string::npos) {
+            params.prompt += " <__image__>";
         }
-        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = params.prompt;
+        if (eval_message(ctx, msg, params.image, true)) {
             return 1;
         }
         if (generate_response(ctx, smpl, n_predict)) {
             return 1;
         }
@@ -293,9 +258,9 @@ int main(int argc, char ** argv) {
     LOG("\n /quit or /exit        exit the program");
     LOG("\n");
 
-    if (eval_text(ctx, "<start_of_turn>user\n")) {
-        return 1;
-    }
+    bool is_first_msg = true;
+    std::vector<std::string> images_fname;
+    std::string content;
 
     while (true) {
         g_is_generating = false;
@@ -320,24 +285,31 @@ int main(int argc, char ** argv) {
         g_is_generating = true;
         if (line.find("/image") == 0) {
             std::string image = line.substr(7);
-            int res = eval_image(ctx, image);
-            if (res == 2) {
-                continue; // image not found
-            }
-            if (res) {
-                return 1;
-            }
+            images_fname.push_back(string_strip(image));
+            content += "<__image__>";
             continue;
+        } else {
+            content += line;
         }
-        if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
-            return 1;
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = content;
+        int ret = eval_message(ctx, msg, images_fname, is_first_msg);
+        if (ret == 2) {
+            // non-fatal error
+            images_fname.clear();
+            content.clear();
+            continue;
        }
-        if (generate_response(ctx, smpl, n_predict)) {
+        if (ret) {
             return 1;
         }
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
+        if (generate_response(ctx, smpl, n_predict)) {
             return 1;
         }
+        images_fname.clear();
+        content.clear();
+        is_first_msg = false;
     }
 }
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index a80ffe5fdf386..379a061715838 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -12,17 +12,18 @@
 #include <limits>
 #include <vector>
 
-static const char * IMG_MARKER = "<image>";
-
 struct llava2_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
     int n_threads;
+    std::string image_marker;
+
+    // TODO @ngxson : add timings
 
-    llava2_context(const char * mmproj_fname,
+    llava2_context(const char * mmproj_fname,
                    const struct llama_model * text_model,
-                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads) {
+                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -95,7 +96,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         const std::vector<llava2_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
-    std::vector<std::string> parts = string_split_str(prompt, IMG_MARKER);
+    std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());
 
@@ -130,7 +131,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
             std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
 
             // preprocess image
-            clip_image_f32_batch_ptr batch_f32;
+            clip_image_f32_batch_ptr batch_f32(new clip_image_f32_batch);
             bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get());
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
@@ -161,14 +162,128 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
 
 LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
         const llava2_image_tokens & image_tokens) {
-    ctx->image_embd_v.reserve(image_tokens.n_tokens * clip_n_mmproj_embd(ctx->ctx_clip));
-    return clip_image_batch_encode(
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+    ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
+    bool ok = clip_image_batch_encode(
         ctx->ctx_clip,
         ctx->n_threads,
         image_tokens.data->batch_f32.get(),
         ctx->image_embd_v.data());
+    return ok ? 0 : 1;
 }
 
 LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) {
     return ctx->image_embd_v.data();
 }
+
+size_t llava2_helper_get_n_tokens(std::vector<llava2_input_chunk> & chunks) {
+    size_t n_tokens = 0;
+    for (auto & chunk : chunks) {
+        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
+            n_tokens += chunk.tokens_text.size();
+        } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
+            n_tokens += chunk.tokens_image.n_tokens;
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
+        }
+    }
+    return n_tokens;
+}
+
+// helper struct to make working with embd batch easier
+// note: this will be removed after llama_batch_ext refactoring
+struct decode_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
+int32_t llava2_helper_eval(llava2_context_ptr & ctx,
+        llama_context * lctx,
+        std::vector<llava2_input_chunk> & chunks,
+        llama_pos pos0,
+        llama_seq_id seq_id,
+        int32_t n_batch) {
+    int32_t ret;
+    llama_pos n_past = pos0;
+    llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
+
+    for (auto & chunk : chunks) {
+        bool is_last = &chunk == &chunks.back();
+        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
+            // TODO @ngxson : may need to split into smaller batches
+            text_batch.n_tokens = chunk.tokens_text.size();
+            for (size_t i = 0; i < chunk.tokens_text.size(); i++) {
+                text_batch.token   [i] = chunk.tokens_text[i];
+                text_batch.pos     [i] = n_past++;
+                text_batch.n_seq_id[i] = 1;
+                text_batch.seq_id  [i][0] = seq_id;
+                text_batch.logits  [i] = false;
+            }
+            if (is_last) {
+                // always get logits for last input chunk
+                text_batch.logits[text_batch.n_tokens - 1] = true;
+            }
+            ret = llama_decode(lctx, text_batch);
+            if (ret != 0) {
+                LOG_ERR("failed to decode text\n");
+                llama_batch_free(text_batch);
+                return ret;
+            }
+
+        } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
+            GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
+            ret = llava2_encode(ctx, chunk.tokens_image);
+            if (ret != 0) {
+                LOG_ERR("failed to encode image\n");
+                llama_batch_free(text_batch);
+                return ret;
+            }
+
+            int32_t n_tokens = chunk.tokens_image.n_tokens;
+            float * embd = llava2_get_output_embd(ctx);
+            decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+            ret = llama_decode(lctx, batch_img.batch);
+            if (ret != 0) {
+                LOG_ERR("failed to decode image\n");
+                llama_batch_free(text_batch);
+                return ret;
+            }
+
+            n_past += n_tokens;
+
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
+        }
+    }
+
+    llama_batch_free(text_batch);
+    return 0;
+}
diff --git a/examples/llava/llava2.h b/examples/llava/llava2.h
index 188c18fdc259b..3691e30c386cd 100644
--- a/examples/llava/llava2.h
+++ b/examples/llava/llava2.h
@@ -62,25 +62,29 @@ struct llava2_context_params {
     bool use_gpu = true;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
+    const char * image_marker = "<__image__>";
image_marker = "<__image__>"; }; +// initialize the llava2 context +// return nullptr on failure LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname, const llama_model * text_model, const llava2_context_params ctx_params); // helper function to load an image from a file +// returns 0 on success LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output); // tokenize an input text prompt and an image -// the prompt must have the input image marker in it +// the prompt must have the input image marker (default: "<__image__>") in it // the marker will be replaced with the image tokens // for example: -// "here is an image: \ndescribe it in detail." +// "here is an image: <__image__>\ndescribe it in detail." // this will gives 3 chunks: // 1. "here is an image: " -// 2. (image tokens) +// 2. (image tokens) // 3. "\ndescribe it in detail." -// number of bitmaps must be equal to the number of markers in the prompt +// number of bitmaps must be equal to the number of image markers in the prompt LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx, std::vector & output, const std::string & prompt, @@ -88,11 +92,28 @@ LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx, bool parse_special, const std::vector & bitmaps); +// returns 0 on success LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx, const llava2_image_tokens & image_tokens); +// get output embeddings from the last encode pass LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx); +// simple helper to count the total number of tokens from a list of chunks, useful to keep track of n_past +LLAVA2_API size_t llava2_helper_get_n_tokens(std::vector & chunks); + +// helper function that automatically: +// 1. run llama_decode() on text chunks +// 2. run llava2_encode() on image chunks, then llava2_get_output_embd() and then llama_decode() +// if any of the llava2_encode() or llama_decode() calls return non-zero, stop and forward the error +// otherwise, returns 0 on success +LLAVA2_API int32_t llava2_helper_eval(llava2_context_ptr & ctx, + llama_context * lctx, + std::vector & chunks, + llama_pos pos0, + llama_seq_id seq_id, + int32_t n_batch); + #else static_assert(false && "C header is not yet supported by this library"); From 7cc4108a9b6bfe406839a186c5f488661b365f8f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 9 Apr 2025 14:47:07 +0200 Subject: [PATCH 03/11] add timings --- examples/llava/CMakeLists.txt | 1 + examples/llava/gemma3-cli.cpp | 1 + examples/llava/llava2.cpp | 11 ++++++++++- examples/llava/llava2.h | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 491435ec2342e..390a8ff9a132a 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -38,6 +38,7 @@ target_link_libraries(llava2 PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(llava2 PUBLIC .) target_include_directories(llava2 PUBLIC ../..) 
+target_include_directories(llava2 PUBLIC ../../common) # for stb_image.h
 
 target_compile_features(llava2 PRIVATE cxx_std_17)
 
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index fd3778f918def..ae32d146e3401 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -88,6 +88,7 @@ struct gemma3_context {
         const char * clip_path = params.mmproj.path.c_str();
         ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
             /* use_gpu   */ true,
+            /* timings   */ true,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         });
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index 379a061715838..a50bf6138ca9a 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -16,6 +16,7 @@ struct llava2_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
+    bool print_timings;
     int n_threads;
     std::string image_marker;
 
@@ -23,7 +24,7 @@ struct llava2_context {
 
     llava2_context(const char * mmproj_fname,
                    const struct llama_model * text_model,
-                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+                   const struct llava2_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -260,22 +261,30 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
 
         } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
+            int64_t t0 = ggml_time_ms();
             ret = llava2_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
                 return ret;
             }
+            if (ctx->print_timings) {
+                LOG_INF("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+            }
 
             int32_t n_tokens = chunk.tokens_image.n_tokens;
             float * embd = llava2_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+            int64_t t1 = ggml_time_ms();
             ret = llama_decode(lctx, batch_img.batch);
             if (ret != 0) {
                 LOG_ERR("failed to decode image\n");
                 llama_batch_free(text_batch);
                 return ret;
             }
+            if (ctx->print_timings) {
+                LOG_INF("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+            }
 
             n_past += n_tokens;
 
diff --git a/examples/llava/llava2.h b/examples/llava/llava2.h
index 3691e30c386cd..501fe781ffadd 100644
--- a/examples/llava/llava2.h
+++ b/examples/llava/llava2.h
@@ -60,6 +60,7 @@ struct llava2_input_chunk {
 
 struct llava2_context_params {
     bool use_gpu = true;
+    bool print_timings = true;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
     const char * image_marker = "<__image__>";

From a9ef623187133720765b33ef5d107a07b1a7e478 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 14:53:29 +0200
Subject: [PATCH 04/11] correct pre/postfix

---
 examples/llava/clip-impl.h |  6 ++++++
 examples/llava/clip.cpp    |  8 ++++++++
 examples/llava/llava2.cpp  | 11 +++++++++++
 3 files changed, 25 insertions(+)

diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index e9c23a59db7ee..8fa8dcdc692a8 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -326,3 +326,9 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
             return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
     }
 }
+
+//
+// API used internally with llava2
+//
+
+projector_type clip_get_projector_type(const struct clip_ctx * ctx);
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 07c3df23c53bb..f8ab6a89e6a38 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2884,3 +2884,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
     clip_image_encode(ctx, n_threads, &clip_img, vec);
     return true;
 }
+
+//
+// API used internally with llava2
+//
+
+projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
+    return ctx->proj_type;
+}
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index a50bf6138ca9a..fa2d73cd38a34 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -97,6 +97,17 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         const std::vector<llava2_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
+    std::string prompt_modified(prompt);
+    std::string marker_modified(ctx->image_marker);
+    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
+    // a bit hacky here, but works for now
+    // for some models, we need to add prefix and suffix to the image embeddings
+    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+        // <start_of_image> ... (image embeddings) ... <end_of_image>
+        marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+    }
+
     std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());

From 3b25bd944cbea7c67ec2fd641f8a0ebe900a82e0 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 15:00:32 +0200
Subject: [PATCH 05/11] fix missing include

---
 examples/llava/clip-impl.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index 8fa8dcdc692a8..d923b04add7aa 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -9,6 +9,7 @@
 #include <map>
 #include <sstream>
 #include <vector>
+#include <cinttypes>
 
 // Internal header for clip.cpp
 

From 1576c82c0fea39548e9f589ffe93db3cc55e5420 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 15:03:07 +0200
Subject: [PATCH 06/11] fix compilation unused var warn

---
 examples/llava/llava2.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index fa2d73cd38a34..77abb689566d8 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -151,8 +151,8 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         }
 
         llava2_image_tokens image_tokens;
-        //image_tokens.nx = ...;
-        //image_tokens.ny = ...;
+        image_tokens.nx = 0; // TODO
+        image_tokens.ny = 0; // TODO
         image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
         image_tokens.data = std::unique_ptr<llava2_image_tokens_data>(
             new llava2_image_tokens_data{

From 117bf734cc36107031eebe4e01adab9ab5759137 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 9 Apr 2025 17:55:14 +0200
Subject: [PATCH 07/11] update llava2_tokenize

---
 examples/llava/gemma3-cli.cpp |  6 +++++-
 examples/llava/llava2.cpp     | 17 +++++++++--------
 examples/llava/llava2.h      | 12 +++++++++---
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index ae32d146e3401..423efc58024e0 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -181,7 +181,11 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
     }
 
     std::vector<llava2_input_chunk> chunks;
-    if (llava2_tokenize(ctx.ctx_vision, chunks, formatted_chat.prompt, add_bos, true, bitmaps)) {
+    llava2_input_text text;
+    text.text          = formatted_chat.prompt;
+    text.add_special   = add_bos;
+    text.parse_special = true;
+    if (llava2_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
         LOG_ERR("Unable to tokenize prompt\n");
         return 1;
     }
diff --git a/examples/llava/llava2.cpp b/examples/llava/llava2.cpp
index 77abb689566d8..1bc153af0edad 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/llava2.cpp
@@ -91,13 +91,11 @@ static std::vector<llama_token> llava2_tokenize_text_internal(
 
 int32_t llava2_tokenize(llava2_context_ptr & ctx,
         std::vector<llava2_input_chunk> & output,
-        const std::string & prompt,
-        bool add_special,
-        bool parse_special,
+        const llava2_input_text & text,
         const std::vector<llava2_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
-    std::string prompt_modified(prompt);
+    std::string prompt_modified(text.text);
     std::string marker_modified(ctx->image_marker);
     projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
     // a bit hacky here, but works for now
@@ -108,7 +106,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }
 
-    std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
+    std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());
 
@@ -117,7 +115,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
     for (const auto & part : parts) {
         //printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
-        auto tokens = llava2_tokenize_text_internal(vocab, part, add_special && add_bos, parse_special);
+        auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
         if (tokens.empty()) {
             continue;
         }
@@ -273,6 +271,9 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
         } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
             int64_t t0 = ggml_time_ms();
+            if (ctx->print_timings) {
+                LOG_INF("encoding image...\n");
+            }
             ret = llava2_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
@@ -280,7 +281,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+                LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
             int32_t n_tokens = chunk.tokens_image.n_tokens;
@@ -294,7 +295,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+                LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
             }
 
             n_past += n_tokens;
diff --git a/examples/llava/llava2.h b/examples/llava/llava2.h
index 501fe781ffadd..06126a8f12680 100644
--- a/examples/llava/llava2.h
+++ b/examples/llava/llava2.h
@@ -66,6 +66,12 @@ struct llava2_context_params {
     const char * image_marker = "<__image__>";
 };
 
+struct llava2_input_text {
+    std::string text;
+    bool add_special;
+    bool parse_special;
+};
+
 // initialize the llava2 context
 // return nullptr on failure
 LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
@@ -74,6 +80,7 @@ LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
 
 // helper function to load an image from a file
 // returns 0 on success
+// this function is thread-safe
 LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output);
 
 // tokenize an input text prompt and an image
@@ -86,11 +93,10 @@ LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output);
 // 2. (image tokens)
 // 3. "\ndescribe it in detail."
 // number of bitmaps must be equal to the number of image markers in the prompt
+// this function is thread-safe (shared ctx)
 LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx,
         std::vector<llava2_input_chunk> & output,
-        const std::string & prompt,
-        bool add_special,
-        bool parse_special,
+        const llava2_input_text & text,
         const std::vector<llava2_bitmap> & bitmaps);

From a6625fa68b0c234dbd1349b4b05efdda0d4ee9e1 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 10 Apr 2025 12:00:30 +0200
Subject: [PATCH 08/11] change name llava2 --> mtmd

---
 examples/llava/CMakeLists.txt           | 36 +++++++--------
 examples/llava/clip-impl.h              |  2 +-
 examples/llava/clip.cpp                 |  2 +-
 examples/llava/gemma3-cli.cpp           | 22 +++++-----
 examples/llava/{llava2.cpp => mtmd.cpp} | 54 +++++++++++------------
 examples/llava/{llava2.h => mtmd.h}     | 58 ++++++++++++-------------
 6 files changed, 87 insertions(+), 87 deletions(-)
 rename examples/llava/{llava2.cpp => mtmd.cpp} (85%)
 rename examples/llava/{llava2.h => mtmd.h} (60%)

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index 390a8ff9a132a..c76555c59f0fd 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -24,41 +24,41 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS llava_shared LIBRARY)
 endif()
 
-# llava2
+# mtmd
 
-add_library(llava2 OBJECT
-            llava2.cpp
-            llava2.h
+add_library(mtmd OBJECT
+            mtmd.cpp
+            mtmd.h
             clip.cpp
             clip.h
             clip-impl.h
             )
 
-target_link_libraries(llava2 PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
 
-target_include_directories(llava2 PUBLIC .)
-target_include_directories(llava2 PUBLIC ../..)
-target_include_directories(llava2 PUBLIC ../../common) # for stb_image.h
+target_include_directories(mtmd PUBLIC .)
+target_include_directories(mtmd PUBLIC ../..)
+target_include_directories(mtmd PUBLIC ../../common) # for stb_image.h
 
-target_compile_features(llava2 PRIVATE cxx_std_17)
+target_compile_features(mtmd PRIVATE cxx_std_17)
 
-add_library(llava2_static STATIC $<TARGET_OBJECTS:llava2>)
+add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
 if (BUILD_SHARED_LIBS)
-    set_target_properties(llava2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llava2 PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    add_library(llava2_shared SHARED $<TARGET_OBJECTS:llava2>)
-    target_link_libraries(llava2_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
-    install(TARGETS llava2_shared LIBRARY)
+    set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
+    target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS mtmd_shared LIBRARY)
 endif()
 
 if (NOT MSVC)
     target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
-    target_compile_options(llava2 PRIVATE -Wno-cast-qual) # stb_image.h
+    target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
 
 if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
-    add_dependencies(llava2 BUILD_INFO)
+    add_dependencies(mtmd BUILD_INFO)
 endif()
 
 set(TARGET llama-llava-cli)
@@ -86,7 +86,7 @@ set(TARGET llama-gemma3-cli)
 add_executable(${TARGET} gemma3-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava2 ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-llava-clip-quantize-cli)
diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index d923b04add7aa..4c03529874924 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -329,7 +329,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 }
 
 //
-// API used internally with llava2
+// API used internally with mtmd
 //
 
 projector_type clip_get_projector_type(const struct clip_ctx * ctx);
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index f8ab6a89e6a38..710309edaecd6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2886,7 +2886,7 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
 }
 
 //
-// API used internally with llava2
+// API used internally with mtmd
 //
 
 projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 423efc58024e0..9b643fcd36d72 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -6,7 +6,7 @@
 #include "ggml.h"
 #include "console.h"
 #include "chat.h"
-#include "llava2.h"
+#include "mtmd.h"
 
 #include <vector>
 #include <limits.h>
@@ -57,7 +57,7 @@ static void sigint_handler(int signo) {
 #endif
 
 struct gemma3_context {
-    llava2_context_ptr ctx_vision;
+    mtmd_context_ptr ctx_vision;
     common_init_result llama_init;
 
     llama_model * model;
@@ -86,7 +86,7 @@ struct gemma3_context {
 
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
+        ctx_vision = mtmd_init_from_file(clip_path, model, mtmd_context_params{
             /* use_gpu   */ true,
             /* timings   */ true,
             /* n_threads */ params.cpuparams.n_threads,
@@ -162,7 +162,7 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
 }
 
 static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
-    std::vector<llava2_bitmap> bitmaps;
+    std::vector<mtmd_bitmap> bitmaps;
 
     common_chat_templates_inputs tmpl_inputs;
     tmpl_inputs.messages = {msg};
@@ -172,30 +172,30 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
 
     for (auto & fname : images_fname) {
-        llava2_bitmap bitmap;
-        if (llava2_bitmap_init_from_file(fname.c_str(), bitmap)) {
+        mtmd_bitmap bitmap;
+        if (mtmd_bitmap_init_from_file(fname.c_str(), bitmap)) {
             LOG_ERR("Unable to load image %s\n", fname.c_str());
             return 2; // image not found
         }
         bitmaps.push_back(std::move(bitmap));
     }
 
-    std::vector<llava2_input_chunk> chunks;
-    llava2_input_text text;
+    std::vector<mtmd_input_chunk> chunks;
+    mtmd_input_text text;
     text.text          = formatted_chat.prompt;
     text.add_special   = add_bos;
     text.parse_special = true;
-    if (llava2_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
+    if (mtmd_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
         LOG_ERR("Unable to tokenize prompt\n");
         return 1;
     }
 
-    if (llava2_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
+    if (mtmd_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
         LOG_ERR("Unable to eval prompt\n");
         return 1;
     }
 
-    ctx.n_past += llava2_helper_get_n_tokens(chunks);
+    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
 
     return 0;
 }
diff --git a/examples/llava/llava2.cpp b/examples/llava/mtmd.cpp
similarity index 85%
rename from examples/llava/llava2.cpp
rename to examples/llava/mtmd.cpp
index 1bc153af0edad..544f53d7ba90d 100644
--- a/examples/llava/llava2.cpp
+++ b/examples/llava/mtmd.cpp
@@ -1,6 +1,6 @@
 #include "clip.h"
 #include "clip-impl.h"
-#include "llava2.h"
+#include "mtmd.h"
 
 #include "llama.h"
 
@@ -12,7 +12,7 @@
 #include <limits>
 #include <vector>
 
-struct llava2_context {
+struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
@@ -22,9 +22,9 @@ struct llava2_context {
 
     // TODO @ngxson : add timings
 
-    llava2_context(const char * mmproj_fname,
+    mtmd_context(const char * mmproj_fname,
                  const struct llama_model * text_model,
-                 const struct llava2_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+                 const struct mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -35,20 +35,20 @@ struct llava2_context {
         this->text_model = text_model;
     }
 
-    ~llava2_context() {
+    ~mtmd_context() {
         clip_free(ctx_clip);
     }
 };
 
-struct llava2_image_tokens_data {
+struct mtmd_image_tokens_data {
     clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
 };
 
-llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
+mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
         const struct llama_model * text_model,
-        const struct llava2_context_params ctx_params) {
+        const struct mtmd_context_params ctx_params) {
     try {
-        auto ctx = std::make_shared<llava2_context>(mmproj_fname, text_model, ctx_params);
+        auto ctx = std::make_shared<mtmd_context>(mmproj_fname, text_model, ctx_params);
         return ctx;
     } catch (const std::exception & e) {
         LOG_ERR("%s: error: %s\n", __func__, e.what());
@@ -56,7 +56,7 @@ llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
     }
 }
 
-int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
+int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
     clip_image_u8_ptr img_u8(clip_image_u8_init());
     bool ok = clip_image_load_from_file(fname, img_u8.get());
     if (!ok) {
@@ -70,7 +70,7 @@ int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
 }
 
 // copied from common_tokenize
-static std::vector<llama_token> llava2_tokenize_text_internal(
+static std::vector<llama_token> mtmd_tokenize_text_internal(
     const struct llama_vocab * vocab,
            const std::string & text,
                         bool   add_special,
@@ -89,10 +89,10 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
     return result;
 }
 
-int32_t llava2_tokenize(llava2_context_ptr & ctx,
-        std::vector<llava2_input_chunk> & output,
-        const llava2_input_text & text,
-        const std::vector<llava2_bitmap> & bitmaps) {
+int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
+        std::vector<mtmd_input_chunk> & output,
+        const mtmd_input_text & text,
+        const std::vector<mtmd_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -115,7 +115,7 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
     for (const auto & part : parts) {
         //printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
-        auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
+        auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
         if (tokens.empty()) {
             continue;
         }
         output.push_back({
             LLAVA2_INPUT_CHUNK_TYPE_TEXT,
             std::move(tokens),
             {},
         });
@@ -148,12 +148,12 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
         }
 
-        llava2_image_tokens image_tokens;
+        mtmd_image_tokens image_tokens;
         image_tokens.nx = 0; // TODO
         image_tokens.ny = 0; // TODO
         image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
-        image_tokens.data = std::unique_ptr<llava2_image_tokens_data>(
-            new llava2_image_tokens_data{
+        image_tokens.data = std::unique_ptr<mtmd_image_tokens_data>(
+            new mtmd_image_tokens_data{
                 std::move(batch_f32),
             }
         );
@@ -170,8 +170,8 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
     return 0;
 }
 
-LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
-        const llava2_image_tokens & image_tokens) {
+LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
+        const mtmd_image_tokens & image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
     bool ok = clip_image_batch_encode(
         ctx->ctx_clip,
         ctx->n_threads,
         image_tokens.data->batch_f32.get(),
         ctx->image_embd_v.data());
     return ok ? 0 : 1;
 }
 
-LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) {
+LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t llava2_helper_get_n_tokens(std::vector<llava2_input_chunk> & chunks) {
+size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks) {
     size_t n_tokens = 0;
     for (auto & chunk : chunks) {
         if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
@@ -235,9 +235,9 @@ struct decode_embd_batch {
     }
 };
 
-int32_t llava2_helper_eval(llava2_context_ptr & ctx,
+int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
         llama_context * lctx,
-        std::vector<llava2_input_chunk> & chunks,
+        std::vector<mtmd_input_chunk> & chunks,
         llama_pos pos0,
         llama_seq_id seq_id,
         int32_t n_batch) {
@@ -274,7 +274,7 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
             if (ctx->print_timings) {
                 LOG_INF("encoding image...\n");
             }
-            ret = llava2_encode(ctx, chunk.tokens_image);
+            ret = mtmd_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
@@ -285,7 +285,7 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
             }
 
             int32_t n_tokens = chunk.tokens_image.n_tokens;
-            float * embd = llava2_get_output_embd(ctx);
+            float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
             ret = llama_decode(lctx, batch_img.batch);
diff --git a/examples/llava/llava2.h b/examples/llava/mtmd.h
similarity index 60%
rename from examples/llava/llava2.h
rename to examples/llava/mtmd.h
index 06126a8f12680..38e29246f6de2 100644
--- a/examples/llava/llava2.h
+++ b/examples/llava/mtmd.h
@@ -25,40 +25,40 @@
 #ifdef __cplusplus
 
-enum llava2_input_chunk_type {
+enum mtmd_input_chunk_type {
     LLAVA2_INPUT_CHUNK_TYPE_TEXT,
     LLAVA2_INPUT_CHUNK_TYPE_IMAGE,
 };
 
-struct llava2_context;
-struct llava2_image_tokens_data; // internal data
+struct mtmd_context;
+struct mtmd_image_tokens_data; // internal data
 
-using llava2_context_ptr           = std::shared_ptr<llava2_context>;
-using llava2_image_tokens_data_ptr = std::shared_ptr<llava2_image_tokens_data>;
+using mtmd_context_ptr           = std::shared_ptr<mtmd_context>;
+using mtmd_image_tokens_data_ptr = std::shared_ptr<mtmd_image_tokens_data>;
 
 // represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3 -struct llava2_bitmap { +struct mtmd_bitmap { uint32_t nx; uint32_t ny; std::vector data; }; // represents the processed image as tokens (to be encoded) -struct llava2_image_tokens { +struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction uint32_t n_tokens; // == nx * ny - llava2_image_tokens_data_ptr data; // internal data + mtmd_image_tokens_data_ptr data; // internal data }; -struct llava2_input_chunk { - llava2_input_chunk_type type; +struct mtmd_input_chunk { + mtmd_input_chunk_type type; std::vector tokens_text; - llava2_image_tokens tokens_image; + mtmd_image_tokens tokens_image; }; -struct llava2_context_params { +struct mtmd_context_params { bool use_gpu = true; bool print_timings = true; int n_threads = 4; @@ -66,22 +66,22 @@ struct llava2_context_params { const char * image_marker = "<__image__>"; }; -struct llava2_input_text { +struct mtmd_input_text { std::string text; bool add_special; bool parse_special; }; -// initialize the llava2 context +// initialize the mtmd context // return nullptr on failure -LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname, +LLAVA2_API mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname, const llama_model * text_model, - const llava2_context_params ctx_params); + const mtmd_context_params ctx_params); // helper function to load an image from a file // returns 0 on success // this function is thread-safe -LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output); +LLAVA2_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output); // tokenize an input text prompt and an image // the prompt must have the input image marker (default: "<__image__>") in it @@ -94,29 +94,29 @@ LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitma // 3. "\ndescribe it in detail." // number of bitmaps must be equal to the number of image markers in the prompt // this function is thread-safe (shared ctx) -LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx, - std::vector & output, - const llava2_input_text & text, - const std::vector & bitmaps); +LLAVA2_API int32_t mtmd_tokenize(mtmd_context_ptr & ctx, + std::vector & output, + const mtmd_input_text & text, + const std::vector & bitmaps); // returns 0 on success -LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx, - const llava2_image_tokens & image_tokens); +LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx, + const mtmd_image_tokens & image_tokens); // get output embeddings from the last encode pass -LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx); +LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx); // simple helper to count the total number of tokens from a list of chunks, useful to keep track of n_past -LLAVA2_API size_t llava2_helper_get_n_tokens(std::vector & chunks); +LLAVA2_API size_t mtmd_helper_get_n_tokens(std::vector & chunks); // helper function that automatically: // 1. run llama_decode() on text chunks -// 2. run llava2_encode() on image chunks, then llava2_get_output_embd() and then llama_decode() -// if any of the llava2_encode() or llama_decode() calls return non-zero, stop and forward the error +// 2. 
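To make the API shape after this rename concrete, here is a minimal caller sketch against the declarations above (C++; `model`, `lctx` and `n_batch` are assumed to come from the usual llama.cpp setup, and "mmproj.gguf"/"image.jpg" are placeholder paths):

    // sketch: shared_ptr-based context, tokenize fills an output parameter
    mtmd_context_ptr ctx = mtmd_init_from_file("mmproj.gguf", model, mtmd_context_params{});
    if (!ctx) { /* init failed */ }

    mtmd_bitmap bitmap;
    if (mtmd_bitmap_init_from_file("image.jpg", bitmap) != 0) { /* load failed */ }

    mtmd_input_text text;
    text.text          = "describe this image: <__image__>";
    text.add_special   = true;
    text.parse_special = true;

    std::vector<mtmd_input_chunk> chunks;
    if (mtmd_tokenize(ctx, chunks, text, {bitmap}) != 0) { /* tokenize failed */ }

    // decodes text chunks and encode+decodes image chunks, in order
    if (mtmd_helper_eval(ctx, lctx, chunks, /*pos0=*/0, /*seq_id=*/0, n_batch) != 0) { /* eval failed */ }

Note that at this point the caller owns the chunk vector directly while the context is shared; the next patch in the series moves both to explicit create/free pairs.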
From 430dbd85e269122858312f5ac23c3f7ec8b319be Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 10 Apr 2025 15:47:43 +0200
Subject: [PATCH 09/11] improve api

---
 examples/llava/CMakeLists.txt |   4 +-
 examples/llava/gemma3-cli.cpp |  12 ++--
 examples/llava/mtmd.cpp       | 110 ++++++++++++++++++++--------
 examples/llava/mtmd.h         |  71 ++++++++++++----------
 4 files changed, 111 insertions(+), 86 deletions(-)

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index c76555c59f0fd..2d5061de460c0 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -37,8 +37,8 @@ add_library(mtmd OBJECT
 target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
 
 target_include_directories(mtmd PUBLIC .)
-target_include_directories(mtmd PUBLIC ../..)
-target_include_directories(mtmd PUBLIC ../../common) # for stb_image.h
+target_include_directories(mtmd PRIVATE ../..)
+target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
 
 target_compile_features(mtmd PRIVATE cxx_std_17)
 
diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 9b643fcd36d72..26d18921a5c75 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -86,12 +86,12 @@ struct gemma3_context {
 
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_vision = mtmd_init_from_file(clip_path, model, mtmd_context_params{
+        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
            /* use_gpu */   true,
            /* timings */   true,
            /* n_threads */ params.cpuparams.n_threads,
            /* verbosity */ GGML_LOG_LEVEL_INFO,
-        });
+        }));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
             exit(1);
@@ -180,22 +180,22 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
         bitmaps.push_back(std::move(bitmap));
     }
 
-    std::vector<mtmd_input_chunk> chunks;
     mtmd_input_text text;
     text.text = formatted_chat.prompt;
     text.add_special = add_bos;
     text.parse_special = true;
-    if (mtmd_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
+    mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.ctx_vision.get(), text, bitmaps));
+    if (chunks == nullptr) {
         LOG_ERR("Unable to tokenize prompt\n");
         return 1;
     }
 
-    if (mtmd_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
+    if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks.get(), ctx.n_past, 0, ctx.n_batch)) {
         LOG_ERR("Unable to eval prompt\n");
         return 1;
     }
 
-    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
+    ctx.n_past += mtmd_helper_get_n_tokens(chunks.get());
 
     return 0;
 }
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 544f53d7ba90d..68baee9dddc8d 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -44,18 +44,30 @@ struct mtmd_image_tokens_data {
     clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
 };
 
-mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
+struct mtmd_image_tokens {
+    uint32_t nx; // number of tokens in x direction
+    uint32_t ny; // number of tokens in y direction
+    uint32_t n_tokens() const { return nx * ny; }
+    clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
+};
+
+mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
         const struct llama_model * text_model,
         const struct mtmd_context_params ctx_params) {
     try {
-        auto ctx = std::make_shared<mtmd_context>(mmproj_fname, text_model, ctx_params);
-        return ctx;
+        return new mtmd_context(mmproj_fname, text_model, ctx_params);
     } catch (const std::exception & e) {
         LOG_ERR("%s: error: %s\n", __func__, e.what());
         return nullptr;
     }
 }
 
+void mtmd_free(mtmd_context * ctx) {
+    if (ctx) {
+        delete ctx;
+    }
+}
+
 int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
     clip_image_u8_ptr img_u8(clip_image_u8_init());
     bool ok = clip_image_load_from_file(fname, img_u8.get());
     if (!ok) {
@@ -89,10 +101,10 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
     return result;
 }
 
-int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
-        std::vector<mtmd_input_chunk> & output,
-        const mtmd_input_text & text,
-        const std::vector<mtmd_bitmap> & bitmaps) {
+mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
+        const mtmd_input_text & text,
+        const std::vector<mtmd_bitmap> & bitmaps) {
+    mtmd_input_chunks * output = new mtmd_input_chunks;
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -107,8 +119,8 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
     }
 
     std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
-    output.clear();
-    output.reserve(parts.size());
+    output->clear();
+    output->reserve(parts.size());
 
     size_t i_img = 0;
 
@@ -119,18 +131,19 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
         if (tokens.empty()) {
             continue;
         }
-        output.push_back({
-            LLAVA2_INPUT_CHUNK_TYPE_TEXT,
+        mtmd_input_chunk chunk{
+            MTMD_INPUT_CHUNK_TYPE_TEXT,
             std::move(tokens),
             {},
-        });
+        };
+        output->emplace_back(std::move(chunk));
 
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
 
             if (i_img >= bitmaps.size()) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return 2;
+                return nullptr;
             }
 
             // shim layer
@@ -145,54 +158,58 @@ int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
             bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get());
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
-                return 1;
+                return nullptr;
             }
 
-            mtmd_image_tokens image_tokens;
-            image_tokens.nx = 0; // TODO
-            image_tokens.ny = 0; // TODO
-            image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
-            image_tokens.data = std::unique_ptr<mtmd_image_tokens_data>(
-                new mtmd_image_tokens_data{
-                    std::move(batch_f32),
-                }
-            );
-
-            output.push_back({
-                LLAVA2_INPUT_CHUNK_TYPE_IMAGE,
+            mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
+            image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
+            image_tokens->ny = 1; // TODO
+            image_tokens->batch_f32 = std::move(batch_f32);
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
-                std::move(image_tokens),
-            });
+                image_tokens,
+            };
+            output->emplace_back(std::move(chunk));
             i_img++;
         }
     }
 
-    return 0;
+    return output;
+}
+
+void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
+    for (auto & chunk : *chunks) {
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
+            delete chunk.tokens_image;
+        }
+    }
+    delete chunks;
 }
 
-LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
-        const mtmd_image_tokens & image_tokens) {
+int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
-    ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
+    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = clip_image_batch_encode(
         ctx->ctx_clip,
         ctx->n_threads,
-        image_tokens.data->batch_f32.get(),
+        image_tokens->batch_f32.get(),
         ctx->image_embd_v.data());
     return ok ? 0 : 1;
 }
 
-LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx) {
+float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks) {
+size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
     size_t n_tokens = 0;
-    for (auto & chunk : chunks) {
-        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
+    for (auto & chunk : *chunks) {
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             n_tokens += chunk.tokens_text.size();
-        } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
-            n_tokens += chunk.tokens_image.n_tokens;
+        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            n_tokens += chunk.tokens_image->n_tokens();
        } else {
            GGML_ASSERT(false && "chunk type not supported");
        }
@@ -235,9 +252,9 @@ struct decode_embd_batch {
     }
 };
 
-int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
+int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_context * lctx,
-        std::vector<mtmd_input_chunk> & chunks,
+        mtmd_input_chunks * chunks,
         llama_pos pos0,
         llama_seq_id seq_id,
         int32_t n_batch) {
@@ -245,9 +262,9 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
     llama_pos n_past = pos0;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
 
-    for (auto & chunk : chunks) {
-        bool is_last = &chunk == &chunks.back();
-        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
+    for (auto & chunk : *chunks) {
+        bool is_last = &chunk == &chunks->back();
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             // TODO @ngxson : may need to split into smaller batches
             text_batch.n_tokens = chunk.tokens_text.size();
             for (size_t i = 0; i < chunk.tokens_text.size(); i++) {
@@ -268,8 +285,9 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
                 return ret;
             }
 
-        } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
+        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
+            GGML_ASSERT(chunk.tokens_image != nullptr);
             int64_t t0 = ggml_time_ms();
             if (ctx->print_timings) {
                 LOG_INF("encoding image...\n");
@@ -284,7 +302,7 @@ int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
                 LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
-            int32_t n_tokens = chunk.tokens_image.n_tokens;
+            int32_t n_tokens = chunk.tokens_image->n_tokens();
             float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index 38e29246f6de2..5222bb73127ec 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -1,5 +1,5 @@
-#ifndef LLAVA2_H
-#define LLAVA2_H
+#ifndef MTMD_H
+#define MTMD_H
 
 #include "ggml.h"
 #include "llama.h"
@@ -12,29 +12,26 @@
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
-#            define LLAVA2_API __declspec(dllexport)
+#            define MTMD_API __declspec(dllexport)
 #        else
-#            define LLAVA2_API __declspec(dllimport)
+#            define MTMD_API __declspec(dllimport)
 #        endif
 #    else
-#        define LLAVA2_API __attribute__ ((visibility ("default")))
+#        define MTMD_API __attribute__ ((visibility ("default")))
 #    endif
 #else
-#    define LLAVA2_API
+#    define MTMD_API
 #endif
 
 #ifdef __cplusplus
 
 enum mtmd_input_chunk_type {
-    LLAVA2_INPUT_CHUNK_TYPE_TEXT,
-    LLAVA2_INPUT_CHUNK_TYPE_IMAGE,
+    MTMD_INPUT_CHUNK_TYPE_TEXT,
+    MTMD_INPUT_CHUNK_TYPE_IMAGE,
 };
 
 struct mtmd_context;
-struct mtmd_image_tokens_data; // internal data
-
-using mtmd_context_ptr = std::shared_ptr<mtmd_context>;
-using mtmd_image_tokens_data_ptr = std::shared_ptr<mtmd_image_tokens_data>;
+struct mtmd_image_tokens;
 
 // represents raw image data, layout is RGBRGBRGB...
 // length of data must be nx * ny * 3
@@ -44,20 +41,14 @@ struct mtmd_bitmap {
     std::vector<unsigned char> data;
 };
 
-// represents the processed image as tokens (to be encoded)
-struct mtmd_image_tokens {
-    uint32_t nx; // number of tokens in x direction
-    uint32_t ny; // number of tokens in y direction
-    uint32_t n_tokens; // == nx * ny
-    mtmd_image_tokens_data_ptr data; // internal data
-};
-
 struct mtmd_input_chunk {
     mtmd_input_chunk_type type;
-    std::vector<llama_token> tokens_text;
-    mtmd_image_tokens tokens_image;
+    std::vector<llama_token> tokens_text;
+    mtmd_image_tokens * tokens_image = nullptr;
 };
 
+using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
+
 struct mtmd_context_params {
     bool use_gpu = true;
     bool print_timings = true;
@@ -74,14 +65,16 @@ struct mtmd_input_text {
 
 // initialize the mtmd context
 // return nullptr on failure
-LLAVA2_API mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
+MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
         const llama_model * text_model,
         const mtmd_context_params ctx_params);
 
+MTMD_API void mtmd_free(mtmd_context * ctx);
+
 // helper function to load an image from a file
 // returns 0 on success
 // this function is thread-safe
-LLAVA2_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
+MTMD_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
 
 // tokenize an input text prompt and an image
 // the prompt must have the input image marker (default: "<__image__>") in it
@@ -94,29 +87,47 @@ LLAVA2_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap &
 // 3. "\ndescribe it in detail."
 // number of bitmaps must be equal to the number of image markers in the prompt
 // this function is thread-safe (shared ctx)
-LLAVA2_API int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
-        std::vector<mtmd_input_chunk> & output,
+MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
         const mtmd_input_text & text,
         const std::vector<mtmd_bitmap> & bitmaps);
 
+// free image chunk data
+MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+
 // returns 0 on success
-LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
-        const mtmd_image_tokens & image_tokens);
+MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
+        const mtmd_image_tokens * image_tokens);
 
 // get output embeddings from the last encode pass
-LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx);
+MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
 // simple helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
-LLAVA2_API size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks);
+MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
 
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
 // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
 // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
 // otherwise, returns 0 on success
-LLAVA2_API int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
+MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_context * lctx,
-        std::vector<mtmd_input_chunk> & chunks,
+        mtmd_input_chunks * chunks,
         llama_pos pos0,
         llama_seq_id seq_id,
         int32_t n_batch);
+
+// convenient unique_ptr wrappers
+struct mtmd_context_deleter {
+    void operator()(mtmd_context * val) { mtmd_free(val); }
+};
+using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
+struct mtmd_input_chunks_deleter {
+    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
+};
+using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
 #else
 
 static_assert(false && "C header is not yet supported by this library");
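The net effect of the API change above is a C-style ownership model with RAII available on top; a caller sketch under the same assumptions as the earlier one (`model`, `lctx`, `n_batch`, and a filled `text`/`bitmaps` pair):

    // sketch: raw pointers + explicit free functions, wrapped in unique_ptr
    mtmd_context_ptr ctx(mtmd_init_from_file("mmproj.gguf", model, mtmd_context_params{}));
    if (!ctx) { /* init failed */ }

    mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.get(), text, bitmaps));
    if (!chunks) { /* tokenize failed */ }

    if (mtmd_helper_eval(ctx.get(), lctx, chunks.get(), /*pos0=*/0, /*seq_id=*/0, n_batch) != 0) {
        /* eval failed */
    }
    // mtmd_input_chunks_free() and mtmd_free() run automatically via the deleters

Returning heap-allocated chunks also lets mtmd_image_tokens become fully opaque, which is what allows this patch to move nx/ny/batch_f32 out of the public header.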
From 6ed09b70dcca12177317e6c9f63e4a30cfc5b1ee Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 10 Apr 2025 18:20:28 +0200
Subject: [PATCH 10/11] refine helpers

---
 examples/llava/gemma3-cli.cpp |  2 +-
 examples/llava/mtmd.cpp       | 39 +++++++++++++++++++++++------------
 examples/llava/mtmd.h         | 21 +++++++++++++------
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 26d18921a5c75..91a07e2a8f40d 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -173,7 +173,7 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
 
     for (auto & fname : images_fname) {
         mtmd_bitmap bitmap;
-        if (mtmd_bitmap_init_from_file(fname.c_str(), bitmap)) {
+        if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
             LOG_ERR("Unable to load image %s\n", fname.c_str());
             return 2; // image not found
         }
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 68baee9dddc8d..743dd6266b169 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -68,19 +68,6 @@ void mtmd_free(mtmd_context * ctx) {
     }
 }
 
-int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_file(fname, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname);
-        return 1;
-    }
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
-    output.data.resize(output.nx * output.ny * 3);
-    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
-    return 0;
-}
-
 // copied from common_tokenize
 static std::vector<llama_token> mtmd_tokenize_text_internal(
     const struct llama_vocab * vocab,
@@ -326,3 +313,29 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
     llama_batch_free(text_batch);
     return 0;
 }
+
+int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image from buffer\n");
+        return 1;
+    }
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize(output.nx * output.ny * 3);
+    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
+    return 0;
+}
+
+int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_file(fname, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image %s\n", fname);
+        return 1;
+    }
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize(output.nx * output.ny * 3);
+    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
+    return 0;
+}
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index 5222bb73127ec..598f6947bb092 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -71,11 +71,6 @@ MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
 
 MTMD_API void mtmd_free(mtmd_context * ctx);
 
-// helper function to load an image from a file
-// returns 0 on success
-// this function is thread-safe
-MTMD_API int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
-
 // tokenize an input text prompt and an image
 // the prompt must have the input image marker (default: "<__image__>") in it
 // the marker will be replaced with the image tokens
@@ -101,7 +96,11 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
-// simple helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
+//
+// helper functions (can be implemented based on other functions)
+//
+
+// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
 MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
 
 // helper function that automatically:
@@ -116,6 +115,16 @@ MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_seq_id seq_id,
         int32_t n_batch);
 
+// helper function to construct a mtmd_bitmap from a file
+// returns 0 on success
+// this function is thread-safe
+MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
+
+// helper function to construct a mtmd_bitmap from a buffer
+// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
+// returns 0 on success
+// this function is thread-safe
+MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
 
 // convenient unique_ptr wrappers
 struct mtmd_context_deleter {
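With the helpers split out, loading from a memory buffer and from a file go through the same decode path; a sketch of the buffer variant (`read_file_to_bytes` is a hypothetical stand-in for however the encoded jpg/png bytes are obtained):

    std::vector<unsigned char> bytes = read_file_to_bytes("image.png"); // hypothetical loader
    mtmd_bitmap bitmap;
    if (mtmd_helper_bitmap_init_from_buf(bytes.data(), bytes.size(), bitmap) != 0) {
        /* decode failed */
    }
    // on success, bitmap.data holds bitmap.nx * bitmap.ny * 3 bytes of RGB pixels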
From aed3216b50d6ba149aa8bcb0f4c70bdae45a0be3 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Thu, 10 Apr 2025 19:41:54 +0200
Subject: [PATCH 11/11] Update examples/llava/mtmd.cpp

Co-authored-by: Georgi Gerganov
---
 examples/llava/mtmd.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 743dd6266b169..58503d0b22c33 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -23,8 +23,8 @@ struct mtmd_context {
 
     // TODO @ngxson : add timings
     mtmd_context(const char * mmproj_fname,
-            const struct llama_model * text_model,
-            const struct mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+            const llama_model * text_model,
+            const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
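Across all of these revisions the position bookkeeping stays with the caller, which is what mtmd_helper_get_n_tokens is for; a closing sketch of the loop a CLI like gemma3-cli ends up with (same assumed setup as the earlier sketches):

    llama_pos n_past = 0;
    if (mtmd_helper_eval(ctx.get(), lctx, chunks.get(), n_past, /*seq_id=*/0, n_batch) != 0) {
        /* eval failed */
    }
    n_past += mtmd_helper_get_n_tokens(chunks.get());
    // generation then continues from n_past: sample a token, llama_decode() it, n_past++, repeat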