From 2a458d1a9dc45bd27bf1c3a05fbb0402c3a31f4c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 18 Jan 2025 12:19:25 +0100 Subject: [PATCH 01/25] wip --- include/llama.h | 10 + src/llama-vision.cpp | 841 +++++++++++++++++++++++++++++++++++++++++++ src/llama-vision.h | 151 ++++++++ 3 files changed, 1002 insertions(+) create mode 100644 src/llama-vision.cpp create mode 100644 src/llama-vision.h diff --git a/include/llama.h b/include/llama.h index 298b8d1bc0fa2..6049d2382967a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -229,6 +229,16 @@ extern "C" { bool sorted; } llama_token_data_array; + // represent an RGB image + // size of data must be equal to 3*nx*ny + typedef struct llama_vision_bitmap { + uint32_t nx; + uint32_t ny; + unsigned char * data; + } llama_vision_bitmap; + + struct llama_vision_patches; + typedef bool (*llama_progress_callback)(float progress, void * user_data); // Input data for llama_decode diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp new file mode 100644 index 0000000000000..87a33c1818257 --- /dev/null +++ b/src/llama-vision.cpp @@ -0,0 +1,841 @@ +#include "llama.h" +#include "llama-vision.h" +#include "llama-impl.h" + +#include // memcpy +#include +#include + +#ifndef NDEBUG +// for debugging +#include +#include +#include + +// export clip_image_u8 to bmp file for debugging +// https://codereview.stackexchange.com/questions/195121/writing-a-bitmap-image-from-c +struct clip_image_size; +static int bmp_export(const struct clip_image_u8 &img, const std::string &location); +#endif + +struct clip_image_size { + int width; + int height; +}; + +// RGB uint8 image +// Memory layout: RGBRGBRGB... +struct clip_image_u8 { + int nx; + int ny; + std::vector buf; + clip_image_u8() {} + clip_image_u8(const llama_vision_bitmap & bmp) { + nx = bmp.nx; + ny = bmp.ny; + buf.resize(nx*ny*3); + memcpy(buf.data(), bmp.data, buf.size()); + } +}; + +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; + +static int clip_n_patches(const clip_context & ctx) { + auto & hparams = ctx.model->hparams; + int n_patches = (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size); + return n_patches; +} + +int clip_n_mmproj_embd(const clip_context & ctx) { + if (ctx.model->hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) { + return ctx.model->mm_2_b->ne[0]; + } else { + GGML_ASSERT(false && "invalid proj type"); + } +} + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). 
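+ *
+ * Illustrative example (values not taken from any model config): for an
+ * 800x600 original and candidate resolutions {672x672, 336x1344, 1344x336},
+ * the 672x672 candidate is selected: scale = min(672/800, 672/600) = 0.84,
+ * downscaled size = 672x504, effective resolution = 672*504 = 338688, which
+ * is the highest effective resolution among the candidates.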
+ */ +static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector& possible_resolutions) { + int original_width = original_size.width; + int original_height = original_size.height; + + clip_image_size best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.width; + int height = resolution.height; + float scale = std::min(static_cast(width) / original_width, static_cast(height) / original_height); + int downscaled_width = static_cast(original_width * scale); + int downscaled_height = static_cast(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} + +static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + auto clip = [](int x, int lower, int upper) -> int { + return std::max(lower, std::min(x, upper)); + }; + + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + 
k] = float(Cc2); + } + } + } + } + + return true; +} + +static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { + std::vector patches; + int width = image.nx; + int height = image.ny; + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + clip_image_u8 patch; + patch.nx = std::min(patch_size, width - j); + patch.ny = std::min(patch_size, height - i); + patch.buf.resize(3 * patch.nx * patch.ny); + for (int y = 0; y < patch.ny; ++y) { + for (int x = 0; x < patch.nx; ++x) { + for (int c = 0; c < 3; ++c) { + patch.buf[3 * (y * patch.nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c]; + } + } + } + patches.push_back(patch); + } + } + return patches; +} + +// llava-1.6 type of resize_and_pad (black) +static clip_image_u8 resize_and_pad_image(const clip_image_u8 & image, const clip_image_size & target_resolution) { + int target_width = target_resolution.width; + int target_height = target_resolution.height; + + float scale_w = static_cast(target_width) / image.nx; + float scale_h = static_cast(target_height) / image.ny; + + int new_width, new_height; + + if (scale_w < scale_h) { + new_width = target_width; + new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); + } else { + new_height = target_height; + new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); + } + + clip_image_u8 resized_image; + // bilinear_resize(image, resized_image, new_width, new_height); + bicubic_resize(image, resized_image, new_width, new_height); + + clip_image_u8 padded_image; + padded_image.nx = target_width; + padded_image.ny = target_height; + padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black + + // Calculate padding offsets + int pad_x = (target_width - new_width) / 2; + int pad_y = (target_height - new_height) / 2; + + // Copy the resized image into the center of the padded buffer + for (int y = 0; y < new_height; ++y) { + for (int x = 0; x < new_width; ++x) { + for (int c = 0; c < 3; ++c) { + padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; + } + } + } + return padded_image; +} + +static void normalize_image_u8_to_f32(const clip_image_u8 & src, std::vector & dst, const std::array & mean, const std::array & std) { + dst.resize(src.buf.size()); + + for (size_t i = 0; i < src.buf.size(); ++i) { + int c = i % 3; // rgb + dst[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; + } +} + +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +static llama_vision_patches clip_image_preprocess(const clip_context & ctx, const clip_image_u8 & img) { + bool pad_to_square = true; + auto & params = ctx.model->hparams; + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (params.mm_patch_merge_type == MM_PATCH_MERGE_SPATIAL_UNPAD) { + pad_to_square = false; + } + + llama_vision_patches output_imgs; + output_imgs.px = clip_n_patches(ctx); + output_imgs.py = clip_n_patches(ctx); + output_imgs.n_px = params.image_size / output_imgs.px; + output_imgs.n_py = params.image_size / output_imgs.py; + + // the logic below is to pad the shorter side to the longer side with a background color: 
rgb(122, 116, 104) + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + + clip_image_u8 temp; + if (pad_to_square && img.nx != img.ny) { + // if the image is not square, pad it to a square + int longer_side = std::max(img.nx, img.ny); + temp.nx = longer_side; + temp.ny = longer_side; + temp.buf.resize(3 * longer_side * longer_side); + const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) + + // fill with background color + for (size_t i = 0; i < temp.buf.size(); i++) { + temp.buf[i] = bc[i % 3]; + } + + // copy from the input image + for (int y = 0; y < img.ny; y++) { + for (int x = 0; x < img.nx; x++) { + const int i = 3 * (y * img.nx + x); + const int j = 3 * (y * temp.nx + x); + temp.buf[j] = img.buf[i]; + temp.buf[j+1] = img.buf[i+1]; + temp.buf[j+2] = img.buf[i+2]; + } + } + } else if (params.image_grid_pinpoints[0] != 0) { + // "spatial_unpad" with "anyres" processing for llava-1.6 + std::vector possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) { + clip_image_size s; + s.width = params.image_grid_pinpoints[i]; + s.height = params.image_grid_pinpoints[i+1]; + possible_resolutions.push_back(s); + } + clip_image_size best_resolution = select_best_resolution({img.nx, img.ny}, possible_resolutions); + // clip_image_save_to_bmp(*img, "input.bmp"); + temp = resize_and_pad_image(img, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 + // clip_image_save_to_bmp(*temp, "resized.bmp"); + + std::vector patches = divide_to_patches_u8(temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) + + clip_image_u8 image_original_resize; + // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square + bicubic_resize(img, image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square + patches.insert(patches.begin(), image_original_resize); + // clip_image_f32_batch_init(patches.size()); + output_imgs.buf.resize(patches.size()); + int num = 0; + for (auto & patch : patches) { + normalize_image_u8_to_f32(patch, output_imgs.buf[num], params.image_mean, params.image_std); + num++; + } + return output_imgs; + } else { + temp.nx = img.nx; + temp.ny = img.ny; + temp.buf.resize(img.buf.size()); + memcpy(temp.buf.data(), img.buf.data(), temp.buf.size()); + } + + const int nx = temp.nx; + const int ny = temp.ny; + // bmp_export(temp, "resized_vanilla.bmp"); + + const int nx2 = params.image_size; + const int ny2 = params.image_size; + std::vector res; + res.resize(3 * nx2 * ny2); + + const float scale = std::max(nx, ny) / (float)params.image_size; + + const int nx3 = int(nx / scale + 0.5f); + const int ny3 = int(ny / scale + 0.5f); + + const auto & m3 = params.image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; + const auto & s3 = params.image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; + + for (int y = 0; y < ny3; y++) { + for (int x = 0; x < nx3; x++) { + for (int c = 0; c < 3; c++) { + // linear interpolation + const float sx = (x + 0.5f) * scale - 0.5f; + const float sy = (y + 0.5f) * scale - 0.5f; + + const int x0 = std::max(0, (int)std::floor(sx)); + const int y0 = std::max(0, (int)std::floor(sy)); + + const int x1 = std::min(x0 + 1, nx - 1); + const int y1 = std::min(y0 + 1, ny - 1); + + const float dx = sx - 
x0; + const float dy = sy - y0; + + const int j00 = 3 * (y0 * nx + x0) + c; + const int j01 = 3 * (y0 * nx + x1) + c; + const int j10 = 3 * (y1 * nx + x0) + c; + const int j11 = 3 * (y1 * nx + x1) + c; + + const float v00 = temp.buf[j00]; + const float v01 = temp.buf[j01]; + const float v10 = temp.buf[j10]; + const float v11 = temp.buf[j11]; + + const float v0 = v00 * (1.0f - dx) + v01 * dx; + const float v1 = v10 * (1.0f - dx) + v11 * dx; + + const float v = v0 * (1.0f - dy) + v1 * dy; + + const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); + + const int i = 3 * (y * nx3 + x) + c; + + res[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c]; + } + } + } + + output_imgs.buf.resize(1); + output_imgs.buf[0] = std::move(res); + + return output_imgs; +} + +static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, clip_image_size & image_size) { + auto & model = *ctx.model; + auto & hparams = ctx.model->hparams; + + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int patch_size = hparams.patch_size; + const float eps = hparams.eps; + const int num_patches = ((image_size.width / patch_size) * (image_size.height / patch_size)); + const int num_positions = num_patches + (model.class_embedding ? 1 : 0); + + LLAMA_LOG_DEBUG("%s: num_patches = %d\n", __func__, num_patches); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx.buf_compute_meta.size(), + /*.mem_buffer =*/ ctx.buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // input + struct ggml_tensor * embeddings; + { + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size.width, image_size.height, 3, batch_size); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + } + // auto * ne = inp->ne; printf("%d %d %d %d\n", ne[0], ne[1], ne[2], ne[3]); + + embeddings = inp; + if (model.class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + } + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + embeddings = ggml_add(ctx0, + embeddings, + ggml_get_rows(ctx0, model.position_embeddings, positions)); + } + + // pre-layernorm + if (model.pre_norm_w) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "pre_ln"); + + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_norm_w), model.pre_norm_b); + } + + // loop over layers + for (int il = 0; il < (int)hparams.n_layer + hparams.select_layer; il++) { + struct ggml_tensor * cur = embeddings; + + // layernorm1 + { + cur = ggml_norm(ctx0, cur, eps); + cur = ggml_add(ctx0, + 
ggml_mul(ctx0, cur, model.layers[il].norm_in_w), + model.layers[il].norm_in_b); + } + + // self-attention + { + + struct ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.layers[il].q_w, cur), + model.layers[il].q_b); + + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.layers[il].k_w, cur), + model.layers[il].k_b); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.layers[il].v_w, cur), + model.layers[il].v_b); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].output_w, cur), model.layers[il].output_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = ggml_norm(ctx0, cur, eps); + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.layers[il].norm_out_w), + model.layers[il].norm_out_b); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ffn_up_b); + + if (hparams.use_gelu) { + cur = ggml_gelu_inplace(ctx0, cur); + } else { + cur = ggml_gelu_quick_inplace(ctx0, cur); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ffn_down_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // post-layernorm + if (model.post_norm_w) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_norm_w), model.post_norm_b); + } + + // llava projector + { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_set_name(patches, "patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + if (hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + embeddings = ggml_gelu(ctx0, embeddings); + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } else { + GGML_ASSERT(false && "unsupported proj type"); + } + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + ggml_free(ctx0); + return gf; 
+} + +static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_batch & imgs, std::vector & output) { + int batch_size = imgs.size(); + auto & model = *ctx.model; + auto & hparams = ctx.model->hparams; + + if (hparams.arch == VISION_ARCH_LLAVA) { + GGML_ASSERT(batch_size == 1); // TODO: support multiple images + } + + clip_image_size image_size{(int)hparams.image_size, (int)hparams.image_size}; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size.width / patch_size) * (image_size.height / patch_size)); + const int num_positions = num_patches + (model.class_embedding ? 1 : 0); + + LLAMA_LOG_DEBUG("%s: image_size = %d\n", __func__, hparams.image_size); + LLAMA_LOG_DEBUG("%s: num_positions = %d\n", __func__, num_positions); + + // build the inference graph + ggml_cgraph * gf = clip_image_build_graph(ctx, batch_size, image_size); + + // alloc memory for graph + bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf); + if (!ok) { + LLAMA_LOG_ERROR("failed to alloc memory for graph\n"); + return -1; + } + + // set raw input + { + struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); + float * data = (float *)malloc(ggml_nbytes(inp_raw)); + + for (int i = 0; i < batch_size; i++) { + const int nx = imgs[i].nx; + const int ny = imgs[i].ny; + const int n = nx * ny; + + for (int b = 0; b < batch_size; b++) { + for (int k = 0; k < 3; k++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].buf[3 * (y * nx + x) + k]; + } + } + } + } + } + ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); + free(data); + } + + if (model.class_embedding) { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + + void* zero_mem = malloc(ggml_nbytes(embeddings)); + memset(zero_mem, 0, ggml_nbytes(embeddings)); + ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); + free(zero_mem); + } + + { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = i; + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + + { + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { + patches_data[i] = i + 1; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); + } + + // compute + ggml_backend_sched_graph_compute_async(ctx.sched, gf); + + // the last node is the embedding tensor + struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(ctx.sched, embeddings); + + // copy the embeddings to the location passed by the user + size_t out_nbytes = clip_n_patches(ctx)*clip_n_mmproj_embd(ctx)*sizeof(float); + GGML_ASSERT(out_nbytes == ggml_nbytes(embeddings)); + output.resize(out_nbytes); + ggml_backend_tensor_get_async(backend_embd, embeddings, output.data(), 0, ggml_nbytes(embeddings)); + + ggml_backend_sched_synchronize(ctx.sched); + + return 0; +} + +static int32_t clip_image_encode(clip_context & ctx, const clip_image_f32 & img, std::vector & output) { + clip_image_f32_batch imgs{img}; + return clip_image_batch_encode(ctx, imgs, output); +} + +static int32_t encode_image_with_clip(clip_context & ctx, const llama_img 
img, std::vector & output_embd) { + clip_image_u8 img_u8(img); + clip_image_f32_batch img_res_v; + auto & hparams = ctx.model->hparams; + // bmp_export(img_u8, "test_inp.bmp"); + + if (!clip_image_preprocess(ctx, img_u8, img_res_v)) { + LLAMA_LOG_ERROR("%s: unable to preprocess image\n", __func__); + return -2; + } + + switch (hparams.mm_patch_merge_type) { + case MM_PATCH_MERGE_FLAT: + { + // flat / default llava-1.5 type embedding + // n_output = clip_n_patches(ctx); + int32_t encoded = clip_image_encode(ctx, img_res_v[0], output_embd); + if (encoded != 0) { + LLAMA_LOG_ERROR("Unable to encode image\n"); + return encoded; + } + } break; + case MM_PATCH_MERGE_SPATIAL_UNPAD: + { + // TODO: support llava-1.6 + (void)0; + } break; + default: + GGML_ASSERT(false && "unsupported mm_patch_merge_type"); + } + + return 0; +} + +//////////////////////////////////////////////////////////////////////////////////////// +// public API + +int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch) { + if (batch->n_imgs == 0) { + return 0; + } + + // TODO: batching is not working atm, should be fixed later + const int n_embd = clip_n_mmproj_embd(ctx); + const int n_tokens_per_img = clip_n_patches(ctx); + const int n_pos = n_tokens_per_img*batch->n_imgs; + + ctx.out_embd.resize(n_embd*n_pos); + ctx.out_pos.resize(n_pos); + + for (int i = 0; i < batch->n_imgs; i++) { + std::vector output_single; + int32_t status = encode_image_with_clip(ctx, *batch->imgs[i], output_single); + if (status != 0) { + return status; + } + // copy output embeddings to result + for (int k = 0; k < n_embd*n_tokens_per_img; k++) { + ctx.out_embd[n_embd*n_tokens_per_img*i + k] = output_single[k]; + } + // fill position for all output tokens + for (int p = 0; p < n_tokens_per_img; p++) { + ctx.out_pos[n_tokens_per_img*i + p] = batch->pos[i] + p; + } + } + + return 0; +} + +void llama_vision_clear_output(clip_context & ctx) { + ctx.out_embd.clear(); + ctx.out_pos.clear(); +} + +//////////////////////////////////////////////////////////////////////////////////////// +// for debugging +#ifndef NDEBUG + +static int bmp_export(const struct clip_image_u8 &img, const std::string &location) { + const uint32_t width = img.nx; + const uint32_t height = img.ny; + // swap red and blue channel + std::vector buffer(width*height*3); + for (uint32_t y = 0; y < height; y++) { + for (uint32_t x = 0; x < width; x++) { + size_t base = x*3 + y*3*width; + buffer[base+2] = img.buf[base]; + buffer[base+1] = img.buf[base+1]; + buffer[base] = img.buf[base+2]; + } + } + const bool hasAlphaChannel = false; + + std::ofstream fout(location, std::ios::out | std::ios::binary); + + if (fout.fail()) { + return 0; + } + + //Padding + const uint8_t padding = hasAlphaChannel ? 0 : (4 - (width * 3) % 4) % 4; + + //Bitmap file header. + const char signature[2] = { 'B', 'M' }; + const uint32_t fileSize = buffer.size() * sizeof(uint8_t) + padding * (height - 1) + 14 + 124; + const uint32_t offset = 14 + 124; + + //Bitmap information header file + const uint32_t DIBSize = 124; + const int32_t bitmapWidth = width; + const int32_t bitmapHeight = height; + const uint16_t numPlanes = 1; + const uint16_t bitsPerPixel = (hasAlphaChannel) ? 32 : 24; + const uint32_t compressionMethod = (hasAlphaChannel) ? 
3 : 0; //BI_RGB = 0, BI_BITFIELDS = 3 + const uint32_t bitmapSize = buffer.size() * sizeof(uint8_t); + const int32_t horizontalResolution = 2834; + const int32_t verticalResolution = 2834; + const uint32_t numColors = 0; + const uint32_t impColorCount = 0; + const uint32_t redBitmask = (hasAlphaChannel) ? 0x0000FF00 : 0; //ARGB32 pixel format + const uint32_t greenBitmask = (hasAlphaChannel) ? 0x00FF0000 : 0; + const uint32_t blueBitmask = (hasAlphaChannel) ? 0xFF000000 : 0; + const uint32_t alphaBitmask = (hasAlphaChannel) ? 0x000000FF : 0; + + //Writing the file header and information header to the file + std::vector header(offset, 0); + header[0] = signature[0]; + header[1] = signature[1]; + +#define BMP_HEADERS(i, variableName) header[i] = variableName; header[i+1] = variableName >> 8; header[i+2] = variableName >> 16; header[i+3] = variableName >> 24; + + BMP_HEADERS(2, fileSize); + BMP_HEADERS(6, 0); + BMP_HEADERS(10, offset); + BMP_HEADERS(14, DIBSize); + BMP_HEADERS(18, bitmapWidth); + BMP_HEADERS(22, bitmapHeight); + + header[26] = (uint8_t)numPlanes; + header[27] = (uint8_t)(numPlanes >> 8); + header[28] = (uint8_t)bitsPerPixel; + header[29] = (uint8_t)(bitsPerPixel >> 8); + + BMP_HEADERS(30, compressionMethod); + BMP_HEADERS(34, (unsigned char)bitmapSize); + BMP_HEADERS(38, (unsigned char)horizontalResolution); + BMP_HEADERS(42, (unsigned char)verticalResolution); + BMP_HEADERS(46, (unsigned char)numColors); + BMP_HEADERS(50, (unsigned char)impColorCount); + BMP_HEADERS(54, (unsigned char)redBitmask); + BMP_HEADERS(58, (unsigned char)greenBitmask); + BMP_HEADERS(62, (unsigned char)blueBitmask); + BMP_HEADERS(66, alphaBitmask); + +#undef BMP_HEADERS + + fout.write((char *)header.data(), sizeof(uint8_t) * header.size()); + + //Writing the pixel array + const uint32_t bWidth = bitsPerPixel / 8 * width; + + for (int i = height - 1; i >= 0; i--) { + std::vector row(buffer.begin() + i * bWidth, buffer.begin() + i * bWidth + bWidth); + fout.write((char *)row.data(), row.size() * sizeof(uint8_t)); + fout.seekp(padding * sizeof(uint8_t), std::ios::cur); + } + + fout.close(); + return 1; +} + +#endif + diff --git a/src/llama-vision.h b/src/llama-vision.h new file mode 100644 index 0000000000000..d7c922d99ff26 --- /dev/null +++ b/src/llama-vision.h @@ -0,0 +1,151 @@ +#pragma once + +#include "ggml.h" +#include "llama.h" + +#include +#include + +enum vision_arch { + VISION_ARCH_UNKNOWN, + VISION_ARCH_LLAVA, +}; + +enum clip_projector_type { + CLIP_PROJECTOR_TYPE_UNKNOWN, + CLIP_PROJECTOR_TYPE_MLP, +}; + +enum mm_patch_merge { + MM_PATCH_MERGE_UNKNOWN, + MM_PATCH_MERGE_FLAT, + MM_PATCH_MERGE_SPATIAL_UNPAD, +}; + +struct clip_hparams { + vision_arch arch = VISION_ARCH_UNKNOWN; + + uint32_t image_size; + uint32_t patch_size; + uint32_t hidden_size; + uint32_t n_intermediate; + uint32_t projection_dim; + uint32_t n_head; + uint32_t n_layer; + uint32_t max_pos_embd; + int32_t select_layer = 0; + bool use_gelu = false; + + float eps; + + clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_UNKNOWN; + mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_FLAT; + + std::array image_mean; + std::array image_std; + + std::array image_grid_pinpoints; + int32_t image_crop_resolution; +}; + +struct clip_layer { + // attention + struct ggml_tensor * k_w = NULL; + struct ggml_tensor * k_b = NULL; + struct ggml_tensor * q_w = NULL; + struct ggml_tensor * q_b = NULL; + struct ggml_tensor * v_w = NULL; + struct ggml_tensor * v_b = NULL; + + struct ggml_tensor * output_w = NULL; + struct ggml_tensor * output_b = 
NULL; + + // layernorm 1 + struct ggml_tensor * norm_in_w = NULL; + struct ggml_tensor * norm_in_b = NULL; + + // ff + struct ggml_tensor * ffn_up_w = NULL; + struct ggml_tensor * ffn_up_b = NULL; + + struct ggml_tensor * ffn_down_w = NULL; + struct ggml_tensor * ffn_down_b = NULL; + + // layernorm 2 + struct ggml_tensor * norm_out_w = NULL; + struct ggml_tensor * norm_out_b = NULL; +}; + +struct clip_vision_model { + struct clip_hparams hparams; + + // embeddings + struct ggml_tensor * class_embedding = NULL; + struct ggml_tensor * patch_embeddings = NULL; + struct ggml_tensor * patch_bias = NULL; + struct ggml_tensor * position_embeddings = NULL; + + struct ggml_tensor * pre_norm_w = NULL; + struct ggml_tensor * pre_norm_b = NULL; + + std::vector layers; + + struct ggml_tensor * post_norm_w = NULL; + struct ggml_tensor * post_norm_b = NULL; + + struct ggml_tensor * projection = NULL; + + // LLaVA projection + struct ggml_tensor * mm_1_w = NULL; + struct ggml_tensor * mm_1_b = NULL; + struct ggml_tensor * mm_2_w = NULL; + struct ggml_tensor * mm_2_b = NULL; + + struct ggml_tensor * image_newline = NULL; +}; + +struct clip_context { + // memory buffers used to evaluate the model + std::vector buf_compute_meta; + ggml_backend_sched_t sched = nullptr; + + const clip_vision_model * model; + + // temporary output data, to be picked up by llama_decode() + std::vector out_embd; // size == n_tokens * n_embd + std::vector out_pos; // position of each token +}; + +struct llama_vision_patches { + uint32_t px; // size of patch + uint32_t py; // size of patch + size_t n_px; // number of patches in x direction + size_t n_py; // number of patches in y direction + // RGB float32 image (NHWC) + // Memory layout: RGBRGBRGB... + std::vector> buf; // preprocessed image data +}; + +mm_patch_merge mm_patch_merge_from_name(std::string & name) { + if (name == "flat") { + return MM_PATCH_MERGE_FLAT; + } else if (name == "spatial_unpad") { + return MM_PATCH_MERGE_SPATIAL_UNPAD; + } + return MM_PATCH_MERGE_UNKNOWN; +} + +clip_projector_type clip_projector_type_from_name(std::string & name) { + if (name == "mlp") { + return CLIP_PROJECTOR_TYPE_MLP; + } + return CLIP_PROJECTOR_TYPE_UNKNOWN; +} + +llama_vision_patches * llama_vision_patches_init(llama_vision_bitmap * bmp); +void llama_vision_patches_free(llama_vision_patches * p); + +int32_t llama_vision_encode_impl(clip_context & ctx, llama_vision_patches * p); + +// dimension of the output embeddings, must be equal to n_embd of language model +int clip_n_mmproj_embd(const clip_context & ctx); From 0a81051ae2c7c881fc1ce74f95f28cba977fbe9b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 18 Jan 2025 20:56:35 +0100 Subject: [PATCH 02/25] llama : second attempt to refactor vision API --- common/arg.cpp | 2 +- common/common.h | 1 + examples/CMakeLists.txt | 1 + examples/vision/CMakeLists.txt | 5 + examples/vision/README.md | 3 + examples/vision/vision.cpp | 211 +++++++++++++++++++++++++++++++++ include/llama.h | 26 +++- src/CMakeLists.txt | 1 + src/llama-arch.cpp | 77 +++++++++++- src/llama-arch.h | 71 ++++++++++- src/llama-batch.cpp | 42 ++++++- src/llama-batch.h | 2 + src/llama-context.cpp | 2 +- src/llama-context.h | 4 + src/llama-model-loader.cpp | 2 + src/llama-model.cpp | 114 ++++++++++++++++++ src/llama-model.h | 5 + src/llama-vision.cpp | 164 ++++++++++++------------- src/llama-vision.h | 87 +++++++------- src/llama.cpp | 20 +++- 20 files changed, 695 insertions(+), 145 deletions(-) create mode 100644 examples/vision/CMakeLists.txt create mode 100644 
examples/vision/README.md create mode 100644 examples/vision/vision.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 9069950eb0939..710b61c6d3c2b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1403,7 +1403,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.image.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_VISION})); if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", diff --git a/common/common.h b/common/common.h index 691141d6b6b2c..8fc982cf56801 100644 --- a/common/common.h +++ b/common/common.h @@ -79,6 +79,7 @@ enum llama_example { LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_TTS, + LLAMA_EXAMPLE_VISION, LLAMA_EXAMPLE_COUNT, }; diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 66cfab2c3b796..41d968ed64531 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -53,6 +53,7 @@ else() add_subdirectory(tokenize) add_subdirectory(tts) add_subdirectory(gen-docs) + add_subdirectory(vision) if (NOT GGML_BACKEND_DL) # these examples use the backends directly and cannot be built with dynamic loading add_subdirectory(convert-llama2c-to-ggml) diff --git a/examples/vision/CMakeLists.txt b/examples/vision/CMakeLists.txt new file mode 100644 index 0000000000000..ab009157a957f --- /dev/null +++ b/examples/vision/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-vision) +add_executable(${TARGET} vision.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/vision/README.md b/examples/vision/README.md new file mode 100644 index 0000000000000..c2468444caa89 --- /dev/null +++ b/examples/vision/README.md @@ -0,0 +1,3 @@ +# llama.cpp/example/simple-vision + +Minimal demo for vision API diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp new file mode 100644 index 0000000000000..73f8ef1b6ac79 --- /dev/null +++ b/examples/vision/vision.cpp @@ -0,0 +1,211 @@ +#include "llama.h" +#include "common.h" +#include "arg.h" +#include "log.h" +#include "sampling.h" +#include +#include +#include +#include +#include + +#define STB_IMAGE_IMPLEMENTATION +#include "stb_image.h" + +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [--image img_path] [-p prompt]\n", argv[0]); + printf("\n"); +} + +static llama_vision_bitmap * load_image_from_file(const char * fname) { + std::ifstream file(fname, std::ios::binary); + if (!file) { + throw std::runtime_error("Unable to open file"); + } + std::vector image_bytes = std::vector( + std::istreambuf_iterator(file), + std::istreambuf_iterator()); + // decode image to byte array + int nx, ny, nc; + auto * bytes = (unsigned char *) image_bytes.data(); + auto * img = stbi_load_from_memory(bytes, image_bytes.size(), &nx, &ny, &nc, 3); + if (!img) { + throw std::runtime_error("failed to decode image bytes"); + } + // printf("nx=%d ny=%d nc=%d\n", nx, ny, nc); + // GGML_ASSERT(nc == 3); + // for (int y = 0; y < ny; y++) { + // for (int x = 0; x < nx; x++) { + // unsigned char * pix = img + x*nc + y*nc*nx; + // printf("%02x%02x%02x ", pix[0], pix[1], pix[2]); + // } + // printf("\n"); + // } + // printf("\n"); + llama_vision_bitmap * result = llama_vision_bitmap_init(nx, ny); + memcpy(result->data, 
img, nx*ny*3); + stbi_image_free(img); + return result; +} + +// split string by a `std::string delim` instead of `char delim` +static std::vector string_split(std::string s, const std::string & delimiter) { + std::vector tokens; + size_t pos = 0; + std::string token; + while ((pos = s.find(delimiter)) != std::string::npos) { + token = s.substr(0, pos); + tokens.push_back(token); + s.erase(0, pos + delimiter.length()); + } + tokens.push_back(s); + return tokens; +} + +struct tokenized_part { + llama_tokens tokens; + bool is_image; +}; + +// TODO: this function is hacky, need to be improved +// static const llama_token TOKEN_IMG_PLACEMENT = -1000; +static const std::string IMG_PLACEMENT = ""; +static std::vector tokenize_with_img_placement( + const llama_vocab * vocab, + const std::string & text, + bool add_special, + bool parse_special) { + std::vector parts = string_split(text, IMG_PLACEMENT); + std::vector output; + for (const auto & part : parts) { + //printf("tokenizing part: %s\n", part.c_str()); + bool add_bos = &parts.front() == ∂ + auto tokens = common_tokenize(vocab, part, add_special && add_bos, parse_special); + if (tokens.empty()) { + continue; + } + output.push_back({std::move(tokens), false}); + if (&parts.back() != &part) { + // add image token to middle of 2 parts + output.push_back({{}, true}); + } + } + return output; +} + +int main(int argc, char ** argv) { + common_params params; + + // default prompt for llava 1.5 + params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n" + "USER:\nwhat did you see?\nASSISTANT:"; + params.n_predict = 64; + params.n_batch = 2048; + params.n_ubatch = 1024; + params.n_gpu_layers = 99; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_VISION, print_usage)) { + return 1; + } + + common_init(); + common_init_result llama_init = common_init_from_params(params); + llama_context * ctx = llama_init.context.get(); + const llama_model * model = llama_init.model.get(); + const llama_vocab * vocab = llama_model_get_vocab(model); + + struct common_sampler * smpl = common_sampler_init(model, params.sampling); + + llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); + int n_past = 0; + int n_prompt = 0; + + // process image + llama_vision_patches * img_patches = nullptr; + { + const char * img_path = params.image[0].c_str(); + if (params.image[0].empty()) { + LOG_ERR("no image path provided\n"); + return 1; + } + llama_vision_bitmap * img = load_image_from_file(img_path); + LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny); + img_patches = llama_vision_patches_init(ctx, img); + if (!img_patches) { + LOG_ERR("failed to create image patches\n"); + return 1; + } + if (llama_vision_encode(ctx, img_patches)) { + LOG_ERR("failed to encode image\n"); + return 1; + } + LOG_INF("encoded image\n"); + } + + // process prompt + { + std::vector parts = tokenize_with_img_placement(vocab, params.prompt, true, true); + for (const tokenized_part & part : parts) { + if (!part.is_image) { + for (const llama_token & token : part.tokens) { + //LOG_INF("%d -> %s\n", token, common_token_to_piece(ctx, token).c_str()); + common_batch_add(batch, token, n_past++, {0}, &part == &parts.back()); + } + LOG_INF("eval text batch (%d tokens)\n", batch.n_tokens); + if (llama_decode(ctx, batch)) { + LOG_ERR("failed to decode text prompt\n"); + return 1; + } + } else { + auto * img_embd = llama_vision_get_output_tensor(ctx); 
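+ // the encoder output is an [n_embd, n_patches] tensor (one embedding per
+ // image patch, with n_embd matching the language model embedding size);
+ // it is wrapped into a batch via llama_batch_get_one_from_tensor() below
+ // and decoded like an ordinary text batch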
+ // std::vector output_debug(ggml_nelements(img_embd)); + // ggml_backend_tensor_get(img_embd, output_debug.data(), 0, ggml_nbytes(img_embd)); + // for (int row = 0; row < 10; row++) { + // int off = row * img_embd->ne[0]; + // printf("... %f %f %f\n", output_debug[off], output_debug[off+1], output_debug[off+2]); + // } + // exit(1); + llama_batch batch_img = llama_batch_get_one_from_tensor(img_embd, n_past, 0); + n_past += batch_img.n_tokens; + LOG_INF("eval image batch (%d embeddings)\n", batch_img.n_tokens); + if (llama_decode(ctx, batch_img)) { + LOG_ERR("failed to decode image prompt\n"); + return 1; + } + llama_batch_free(batch_img); + } + } + n_prompt = n_past; + LOG_INF("prompt processed, %d tokens\n", n_prompt); + } + + // generate response + while (true){ + int n_generated = n_past - n_prompt; + if (n_generated > params.n_predict) { + printf("\n"); + break; + } + + llama_token token_id = common_sampler_sample(smpl, ctx, -1); + common_sampler_accept(smpl, token_id, true); + printf("%s", common_token_to_piece(ctx, token_id).c_str()); + fflush(stdout); + + if (llama_vocab_is_eog(vocab, token_id)) { + printf("\n"); + break; + } + + // eval the token + common_batch_clear(batch); + common_batch_add(batch, token_id, n_past++, {0}, true); + if (llama_decode(ctx, batch)) { + LOG_ERR("failed to decode token\n"); + break; + } + } + + return 0; +} diff --git a/include/llama.h b/include/llama.h index 6049d2382967a..5013e96e78825 100644 --- a/include/llama.h +++ b/include/llama.h @@ -229,6 +229,8 @@ extern "C" { bool sorted; } llama_token_data_array; + struct llama_vision_patches; + // represent an RGB image // size of data must be equal to 3*nx*ny typedef struct llama_vision_bitmap { @@ -237,8 +239,6 @@ extern "C" { unsigned char * data; } llama_vision_bitmap; - struct llama_vision_patches; - typedef bool (*llama_progress_callback)(float progress, void * user_data); // Input data for llama_decode @@ -263,6 +263,8 @@ extern "C" { int32_t * n_seq_id; llama_seq_id ** seq_id; int8_t * logits; // TODO: rename this to "output" + + struct ggml_tensor * embd_tensor; } llama_batch; enum llama_model_kv_override_type { @@ -854,6 +856,10 @@ extern "C" { int32_t embd, int32_t n_seq_max); + // Allocates a batch based on a tensor, only used by vision API for now + // Unlike llama_batch_get_one, this will need to be freed after use + LLAMA_API struct llama_batch llama_batch_get_one_from_tensor(struct ggml_tensor * tensor, int32_t p0, int32_t seq_id); + // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); @@ -1272,6 +1278,22 @@ extern "C" { // TODO: extend in the future //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...); + // + // Vision API + // + + // Container for RGB bitmap + LLAMA_API struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny); + LLAMA_API void llama_vision_bitmap_free(struct llama_vision_bitmap * bmp); + + // Create patches from the RGB bitmap + LLAMA_API struct llama_vision_patches * llama_vision_patches_init(struct llama_context * ctx, llama_vision_bitmap * bmp); + LLAMA_API void llama_vision_patches_free(struct llama_vision_patches * p); + + // Encode patches into embeddings + LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_patches * p); + LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx); + // // Model split // diff --git a/src/CMakeLists.txt 
b/src/CMakeLists.txt index aeb75bf3e625e..1f3b454fa0c82 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,6 +24,7 @@ add_library(llama llama-quant.cpp llama-sampling.cpp llama-vocab.cpp + llama-vision.cpp unicode.h unicode.cpp unicode-data.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index d7d277e72977a..dcfbdab3e8816 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -65,6 +65,11 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_UNKNOWN, "(unknown)" }, }; +static const std::map VISION_ARCH_NAMES = { + { VISION_ARCH_LLAVA, "llava" }, + { VISION_ARCH_UNKNOWN, "(unknown)" }, +}; + static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_TYPE, "general.type" }, { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, @@ -189,6 +194,27 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_VISION_TYPE, "vision.type" }, + { LLM_KV_VISION_IMAGE_SIZE, "vision.image_size" }, + { LLM_KV_VISION_PATCH_SIZE, "vision.patch_size" }, + { LLM_KV_VISION_IMAGE_MEAN, "vision.image_mean" }, + { LLM_KV_VISION_IMAGE_STD, "vision.image_std" }, + { LLM_KV_VISION_CLIP_ARCHITECTURE, "vision.clip.architecture" }, + { LLM_KV_VISION_CLIP_CONTEXT_LENGTH, "vision.clip.context_length" }, + { LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, "vision.clip.embedding_length" }, + { LLM_KV_VISION_CLIP_BLOCK_COUNT, "vision.clip.block_count" }, + { LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, "vision.clip.feed_forward_length" }, + { LLM_KV_VISION_CLIP_PROJECTION_TYPE, "vision.clip.projection_type" }, + { LLM_KV_VISION_CLIP_PROJECTION_DIM, "vision.clip.projection_dim" }, + { LLM_KV_VISION_CLIP_USE_GELU, "vision.clip.use_gelu" }, + { LLM_KV_VISION_CLIP_MAX_POS_EMBD, "vision.clip.max_position_embeddings" }, + { LLM_KV_VISION_CLIP_MAX_SLICES, "vision.clip.max_slices" }, + { LLM_KV_VISION_CLIP_PROJECTOR_TYPE, "vision.clip.projector_type" }, + { LLM_KV_VISION_CLIP_SELECT_LAYER, "vision.clip.select_layer" }, + { LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" }, + { LLM_KV_VISION_CLIP_HEAD_COUNT, "vision.clip.attention.head_count" }, + { LLM_KV_VISION_CLIP_LAYERNORM_EPS, "vision.clip.attention.layer_norm_epsilon" }, + // deprecated { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, @@ -1300,6 +1326,28 @@ static const std::map> LLM_TENSOR_N }, }; +static const std::map> VISION_TENSOR_NAMES = { + { + VISION_ARCH_LLAVA, + { + { VISION_TENSOR_MMPROJ, "v.mmproj_%d" }, + { VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" }, + { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" }, + { VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" }, + { VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, + { VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, + { VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, + { VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, + { VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" }, + { VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, + { VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, + { VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, + { VISION_TENSOR_PRE_NORM, "v.pre_norm" }, + { VISION_TENSOR_POST_NORM, "v.post_norm" }, + } + } +}; + static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_TOKEN_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, {LLM_TENSOR_POS_EMBD, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}}, @@ -1449,7 +1497,8 @@ std::string LLM_KV::operator()(llm_kv kv) const { 
return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch)); } -std::string LLM_TN_IMPL::str() const { +template<> +std::string BASE_TN_IMPL::str() const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } @@ -1464,6 +1513,22 @@ std::string LLM_TN_IMPL::str() const { return name; } +template<> +std::string BASE_TN_IMPL::str() const { + if (VISION_TENSOR_NAMES.at(arch).find(tensor) == VISION_TENSOR_NAMES.at(arch).end()) { + return "__missing__"; + } + + std::string name = ::format(VISION_TENSOR_NAMES.at(arch).at(tensor), bid, xid); + + if (suffix != nullptr) { + name += "."; + name += suffix; + } + + return name; +} + const char * llm_arch_name(llm_arch arch) { auto it = LLM_ARCH_NAMES.find(arch); if (it == LLM_ARCH_NAMES.end()) { @@ -1482,6 +1547,16 @@ llm_arch llm_arch_from_string(const std::string & name) { return LLM_ARCH_UNKNOWN; } +vision_arch vision_arch_from_string(const std::string & name) { + for (const auto & kv : VISION_ARCH_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } + + return VISION_ARCH_UNKNOWN; +} + const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) { return LLM_TENSOR_INFOS.at(tensor); } diff --git a/src/llama-arch.h b/src/llama-arch.h index 349844790453f..ce89b15f544c5 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -69,6 +69,11 @@ enum llm_arch { LLM_ARCH_UNKNOWN, }; +enum vision_arch { + VISION_ARCH_UNKNOWN, + VISION_ARCH_LLAVA, +}; + enum llm_kv { LLM_KV_GENERAL_TYPE, LLM_KV_GENERAL_ARCHITECTURE, @@ -193,6 +198,27 @@ enum llm_kv { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, LLM_KV_CONVNEXT_BLOCK_COUNT, + LLM_KV_VISION_TYPE, + LLM_KV_VISION_IMAGE_SIZE, + LLM_KV_VISION_PATCH_SIZE, + LLM_KV_VISION_IMAGE_MEAN, + LLM_KV_VISION_IMAGE_STD, + LLM_KV_VISION_CLIP_ARCHITECTURE, + LLM_KV_VISION_CLIP_CONTEXT_LENGTH, + LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, + LLM_KV_VISION_CLIP_BLOCK_COUNT, + LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, + LLM_KV_VISION_CLIP_PROJECTION_TYPE, + LLM_KV_VISION_CLIP_PROJECTION_DIM, + LLM_KV_VISION_CLIP_USE_GELU, + LLM_KV_VISION_CLIP_MAX_POS_EMBD, + LLM_KV_VISION_CLIP_MAX_SLICES, + LLM_KV_VISION_CLIP_PROJECTOR_TYPE, + LLM_KV_VISION_CLIP_SELECT_LAYER, + LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, + LLM_KV_VISION_CLIP_HEAD_COUNT, + LLM_KV_VISION_CLIP_LAYERNORM_EPS, + // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID, @@ -328,6 +354,23 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_OUT, }; +enum vision_tensor { + VISION_TENSOR_MMPROJ, + VISION_TENSOR_ENC_EMBD_CLS, + VISION_TENSOR_ENC_EMBD_PATCH, + VISION_TENSOR_ENC_EMBD_POS, + VISION_TENSOR_ENC_ATTN_Q, + VISION_TENSOR_ENC_ATTN_K, + VISION_TENSOR_ENC_ATTN_V, + VISION_TENSOR_ENC_INPUT_NORM, + VISION_TENSOR_ENC_OUTPUT, + VISION_TENSOR_ENC_OUTPUT_NORM, + VISION_TENSOR_ENC_FFN_UP, + VISION_TENSOR_ENC_FFN_DOWN, + VISION_TENSOR_PRE_NORM, + VISION_TENSOR_POST_NORM, +}; + enum llm_tensor_layer { LLM_TENSOR_LAYER_INPUT, LLM_TENSOR_LAYER_REPEATING, @@ -351,9 +394,10 @@ struct LLM_KV { // std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias" // std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight" // -struct LLM_TN_IMPL { - const llm_arch arch; - const llm_tensor tensor; +template +struct BASE_TN_IMPL { + const Tname arch; + const Ttensor tensor; const char * const suffix; const int bid; const int xid; @@ -364,15 +408,16 @@ struct LLM_TN_IMPL { return str(); } - friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) { + friend bool operator==(const 
std::string & str, const BASE_TN_IMPL & tn) { return str == tn.str(); } - friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) { + friend bool operator!=(const std::string & str, const BASE_TN_IMPL & tn) { return str != tn.str(); } }; +using LLM_TN_IMPL = BASE_TN_IMPL; struct LLM_TN { LLM_TN(llm_arch arch) : arch(arch) {} @@ -387,6 +432,20 @@ struct LLM_TN { } }; +struct VISION_TN { + VISION_TN(vision_arch arch) : arch(arch) {} + + vision_arch arch; + + BASE_TN_IMPL operator()(vision_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const { + return { arch, tensor, suffix, bid, xid }; + } + + BASE_TN_IMPL operator()(vision_tensor tensor, int bid = -1, int xid = -1) const { + return { arch, tensor, nullptr, bid, xid }; + } +}; + struct llm_tensor_info { llm_tensor_layer layer; @@ -397,4 +456,6 @@ const char * llm_arch_name(llm_arch arch); llm_arch llm_arch_from_string(const std::string & name); +vision_arch vision_arch_from_string(const std::string & name); + const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor); diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 01d5ca57fd82b..5ed32d8595256 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -31,6 +31,7 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { /*n_seq_id =*/ ubatch_n_seq_id.data(), /*seq_id =*/ ubatch_seq_id.data(), /*output =*/ ubatch_output.data(), + /*embd_tensor =*/ nullptr, }; return ubatch; } @@ -55,7 +56,9 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s } else { ubatch.token = nullptr; } - if (batch->embd) { + if (batch->embd_tensor) { + ubatch.embd_tensor = batch->embd_tensor; + } else if (batch->embd) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { memcpy( @@ -139,7 +142,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr || batch->embd_tensor != nullptr); ubatch.equal_seqs = false; if (!seq.empty()) { llama_sbatch_seq & s = seq[0]; @@ -152,7 +155,7 @@ llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) { llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr || batch->embd_tensor != nullptr); if (!seq.empty()) { size_t length = 0; size_t n_tokens_in_ubatch = 0; @@ -179,7 +182,7 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) { llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr || batch->embd_tensor != nullptr); if (!seq.empty()) { llama_sbatch_seq & s = seq[seq.size() - 1]; size_t length = s.length < n_ubatch ? 
s.length : n_ubatch; @@ -320,6 +323,7 @@ struct llama_batch llama_batch_get_one( /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, /*logits =*/ nullptr, + /*embd_tensor =*/ nullptr, }; } @@ -332,6 +336,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ /*n_seq_id =*/ nullptr, /*seq_id =*/ nullptr, /*logits =*/ nullptr, + /*embd_tensor =*/ nullptr, }; if (embd) { @@ -353,6 +358,35 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_ return batch; } +struct llama_batch llama_batch_get_one_from_tensor(struct ggml_tensor * tensor, int32_t p0, int32_t seq_id) { + GGML_ASSERT(tensor->ne[2] == 1 && tensor->ne[3] == 1); + int32_t n_tokens = tensor->ne[1]; + llama_batch batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ nullptr, + /*pos =*/ nullptr, + /*n_seq_id =*/ nullptr, + /*seq_id =*/ nullptr, + /*logits =*/ nullptr, + /*embd_tensor =*/ tensor, + }; + batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); + batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens + 1)); + for (int i = 0; i < n_tokens; ++i) { + batch.pos [i] = p0 + i; + batch.seq_id [i] = (llama_seq_id *) malloc(sizeof(llama_seq_id)); + batch.seq_id [i][0] = seq_id; + batch.n_seq_id[i] = 1; + } + batch.seq_id[n_tokens] = nullptr; + + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + + return batch; +} + void llama_batch_free(struct llama_batch batch) { if (batch.token) free(batch.token); if (batch.embd) free(batch.embd); diff --git a/src/llama-batch.h b/src/llama-batch.h index 773c3808b770f..a5e6f1d499772 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -21,6 +21,8 @@ struct llama_ubatch { int32_t * n_seq_id; // [n_seqs] llama_seq_id ** seq_id; // [n_seqs] int8_t * output; // [n_tokens] + + struct ggml_tensor * embd_tensor; }; struct llama_sbatch_seq { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 671d2a81adabf..47cb701a3b05f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -73,7 +73,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); } - if (ubatch.embd) { + if (ubatch.embd && !ubatch.embd_tensor) { const int64_t n_embd = hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; diff --git a/src/llama-context.h b/src/llama-context.h index a9268b2920908..10c839f55ebca 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -6,6 +6,7 @@ #include "llama-model.h" #include "llama-kv-cache.h" #include "llama-adapter.h" +#include "llama-vision.h" #include "ggml-cpp.h" @@ -107,6 +108,9 @@ struct llama_context { struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + + // vision + clip_context vctx; }; // TODO: make these methods of llama_context diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 75073bf610ac3..2045fcfa582ab 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -375,6 +375,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_key (enum llm_kv kid, bool & result, bool required); template bool llama_model_loader::get_key (enum llm_kv kid, float & result, bool required); + template bool llama_model_loader::get_key (enum llm_kv kid, int32_t & result, 
bool required); template bool llama_model_loader::get_key (enum llm_kv kid, uint32_t & result, bool required); template bool llama_model_loader::get_key(enum llm_kv kid, std::string & result, bool required); @@ -439,6 +440,7 @@ namespace GGUFMeta { // TODO: this is not very clever - figure out something better template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c2d23a8d3a195..42cc230ced973 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1245,6 +1245,54 @@ void llama_model::load_hparams(llama_model_loader & ml) { } hparams.rope_type = llama_model_rope_type(this); + + // vision model + auto & vparams = clip.hparams; + std::string vision_type; + ml.get_key(LLM_KV_VISION_TYPE, vision_type, false); + if (vision_type == "clip-vit") { + LLAMA_LOG_INFO("%s: loading clip-vit vision model\n", __func__); + has_vision = true; + ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true); + ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true); + ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true); + ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, vparams.image_std, 3, true); + ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, vparams.hidden_size, true); + ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, vparams.n_layer, true); + ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true); + ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true); + ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true); + ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true); + { + std::string name; + ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true); + vparams.proj_type = clip_projector_type_from_name(name); + if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) { + throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str())); + } + } + { + std::string name; + ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false); + vparams.mm_patch_merge_type = mm_patch_merge_from_name(name); + } + { + std::string arch; + ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true); + vparams.arch = vision_arch_from_string(arch); + } + } else if (!vision_type.empty()) { + throw std::runtime_error(format("unsupported vision type: %s", vision_type.c_str())); + } + + // arch-specific CLIP hparams + switch (vparams.arch) { + case VISION_ARCH_LLAVA: + { + ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true); + } break; + default: (void)0; + } } void llama_model::load_vocab(llama_model_loader & ml) { @@ -3359,6 +3407,72 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + // load tensors for vision model + auto & vparams = clip.hparams; + if (has_vision) { + const int64_t n_layer = vparams.n_layer; + const int64_t n_embd = vparams.hidden_size; + const int64_t n_ff = vparams.n_intermediate; + const int64_t max_pos_embd = vparams.max_pos_embd; + const int64_t n_channel = 3; // always RGB + const int64_t patch_size = vparams.patch_size; + const auto tn = VISION_TN(vparams.arch); + + // clip is CPU-only for now + clip.buft = ggml_backend_cpu_buffer_type(); + 
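// Illustrative numbers only (hypothetical, not part of this patch): for a llava-1.5 style
// checkpoint whose vision tower is a CLIP ViT-L/14 at 336x336, the keys read above would
// typically resolve to vision.image_size = 336 and vision.patch_size = 14, i.e.
//   (336 / 14) * (336 / 14) = 24 * 24 = 576 patches per image
//   max_position_embeddings = 576 + 1 = 577 (one extra position for the CLS embedding)
// which is the same arithmetic the conversion script uses later in this series when it writes
// vision.clip.max_position_embeddings.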
ggml_context * ctx_vision = ctx_map.at(clip.buft); + clip.layers.resize(n_layer); + + switch (vparams.arch) { + case VISION_ARCH_LLAVA: + { + clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff}); + clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff}); + clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff}); + clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff}); + + clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_embd}); + clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd}); + clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_embd, max_pos_embd}); + + clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd}); + clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd}); + clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = clip.layers[i]; + + layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd}); + layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_embd}); + layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd}); + layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_embd}); + layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_embd}); + + layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_ff}); + layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_embd}); + + layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_embd}); + layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_embd}); + layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_embd}); + layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_embd}); + + layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_embd, n_embd}); + layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_embd}); + } + } break; + default: + throw std::runtime_error("unknown vision architecture"); + } + + if (clip_n_mmproj_embd(clip) != hparams.n_embd) { + throw std::runtime_error("model has vision, but n_mmproj_embd != n_embd"); + } + } + ml.done_getting_tensors(); ml.init_mappings(true, use_mlock ? 
&pimpl->mlock_mmaps : nullptr); diff --git a/src/llama-model.h b/src/llama-model.h index a7c30444786fd..fd3820f1e418b 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -4,6 +4,7 @@ #include "llama-arch.h" #include "llama-hparams.h" #include "llama-vocab.h" +#include "llama-vision.h" #include #include @@ -362,6 +363,10 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; + // vision + bool has_vision = false; + clip_vision_model clip; + private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index 87a33c1818257..b419627e64c44 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -1,6 +1,7 @@ #include "llama.h" #include "llama-vision.h" #include "llama-impl.h" +#include "llama-context.h" #include // memcpy #include @@ -43,15 +44,22 @@ struct clip_image_u8_batch { size_t size; }; -static int clip_n_patches(const clip_context & ctx) { +static int clip_n_patches_x(const clip_context & ctx) { auto & hparams = ctx.model->hparams; - int n_patches = (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size); - return n_patches; + return hparams.image_size / hparams.patch_size; +} + +static int clip_n_patches_y(const clip_context & ctx) { + return clip_n_patches_x(ctx); +} + +static int clip_n_patches(const clip_context & ctx) { + return clip_n_patches_x(ctx) * clip_n_patches_y(ctx); } -int clip_n_mmproj_embd(const clip_context & ctx) { - if (ctx.model->hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) { - return ctx.model->mm_2_b->ne[0]; +uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) { + if (clip_model.hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) { + return clip_model.mm_2_b->ne[0]; } else { GGML_ASSERT(false && "invalid proj type"); } @@ -242,11 +250,11 @@ static llama_vision_patches clip_image_preprocess(const clip_context & ctx, cons pad_to_square = false; } - llama_vision_patches output_imgs; - output_imgs.px = clip_n_patches(ctx); - output_imgs.py = clip_n_patches(ctx); - output_imgs.n_px = params.image_size / output_imgs.px; - output_imgs.n_py = params.image_size / output_imgs.py; + llama_vision_patches output_patches; + output_patches.n_px = clip_n_patches_x(ctx); + output_patches.n_py = clip_n_patches_y(ctx); + output_patches.px = params.patch_size; + output_patches.py = params.patch_size; // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -296,13 +304,13 @@ static llama_vision_patches clip_image_preprocess(const clip_context & ctx, cons bicubic_resize(img, image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square patches.insert(patches.begin(), image_original_resize); // clip_image_f32_batch_init(patches.size()); - output_imgs.buf.resize(patches.size()); + output_patches.buf.resize(patches.size()); int num = 0; for (auto & patch : patches) { - normalize_image_u8_to_f32(patch, output_imgs.buf[num], params.image_mean, params.image_std); + normalize_image_u8_to_f32(patch, output_patches.buf[num], params.image_mean, params.image_std); num++; } - return output_imgs; + return output_patches; } else { temp.nx = img.nx; temp.ny = img.ny; @@ -367,10 +375,10 @@ static llama_vision_patches clip_image_preprocess(const clip_context & ctx, cons } } - output_imgs.buf.resize(1); - output_imgs.buf[0] = std::move(res); + 
output_patches.buf.resize(1); + output_patches.buf[0] = std::move(res); - return output_imgs; + return output_patches; } static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, clip_image_size & image_size) { @@ -556,14 +564,16 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, } } + embeddings = ggml_cont(ctx0, embeddings); + // build the graph ggml_build_forward_expand(gf, embeddings); ggml_free(ctx0); return gf; } -static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_batch & imgs, std::vector & output) { - int batch_size = imgs.size(); +static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches & patches) { + int batch_size = patches.buf.size(); auto & model = *ctx.model; auto & hparams = ctx.model->hparams; @@ -595,15 +605,15 @@ static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_ float * data = (float *)malloc(ggml_nbytes(inp_raw)); for (int i = 0; i < batch_size; i++) { - const int nx = imgs[i].nx; - const int ny = imgs[i].ny; + const int nx = patches.px * patches.n_px; + const int ny = patches.py * patches.n_py; const int n = nx * ny; for (int b = 0; b < batch_size; b++) { for (int k = 0; k < 3; k++) { for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].buf[3 * (y * nx + x) + k]; + data[(b * 3 * n) + k * n + y * nx + x] = patches.buf[b][3 * (y * nx + x) + k]; } } } @@ -644,45 +654,71 @@ static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_ } // compute - ggml_backend_sched_graph_compute_async(ctx.sched, gf); + ggml_backend_sched_graph_compute(ctx.sched, gf); // the last node is the embedding tensor - struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(ctx.sched, embeddings); - - // copy the embeddings to the location passed by the user - size_t out_nbytes = clip_n_patches(ctx)*clip_n_mmproj_embd(ctx)*sizeof(float); - GGML_ASSERT(out_nbytes == ggml_nbytes(embeddings)); - output.resize(out_nbytes); - ggml_backend_tensor_get_async(backend_embd, embeddings, output.data(), 0, ggml_nbytes(embeddings)); + struct ggml_tensor * output_node = ggml_graph_node(gf, -1); + //LLAMA_LOG_INFO("%s: output tensor shape = %lld %lld %lld %lld\n", __func__, output->ne[0], output->ne[1], output->ne[2], output->ne[3]); - ggml_backend_sched_synchronize(ctx.sched); + // copy output node to context + if (ctx.ctx_ggml) { + ggml_free(ctx.ctx_ggml); + } + ggml_init_params params = { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx.ctx_ggml = ggml_init(params); + ctx.output = ggml_dup_tensor(ctx.ctx_ggml, output_node); + ggml_backend_alloc_ctx_tensors_from_buft(ctx.ctx_ggml, ctx.model->buft); + ggml_backend_tensor_copy(output_node, ctx.output); return 0; } -static int32_t clip_image_encode(clip_context & ctx, const clip_image_f32 & img, std::vector & output) { - clip_image_f32_batch imgs{img}; - return clip_image_batch_encode(ctx, imgs, output); +//////////////////////////////////////////////////////////////////////////////////////// +// public API + +struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny) { + llama_vision_bitmap * bmp = new llama_vision_bitmap; + bmp->nx = nx; + bmp->ny = ny; + bmp->data = (unsigned char *)malloc(3 * nx * ny); + return bmp; } -static int32_t encode_image_with_clip(clip_context & ctx, const llama_img img, 
std::vector & output_embd) { - clip_image_u8 img_u8(img); - clip_image_f32_batch img_res_v; - auto & hparams = ctx.model->hparams; - // bmp_export(img_u8, "test_inp.bmp"); +void llama_vision_bitmap_free(llama_vision_bitmap * bmp) { + free(bmp->data); + delete bmp; +} + +struct llama_vision_patches * llama_vision_patches_init( + struct llama_context * ctx, + llama_vision_bitmap * bmp) { + clip_context & vctx = ctx->vctx; + llama_vision_patches p = clip_image_preprocess(vctx, *bmp); + return new llama_vision_patches(p); +} - if (!clip_image_preprocess(ctx, img_u8, img_res_v)) { - LLAMA_LOG_ERROR("%s: unable to preprocess image\n", __func__); - return -2; +void llama_vision_patches_free(llama_vision_patches * p) { + delete p; +} + +int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_patches * p) { + if (p->buf.empty()) { + LLAMA_LOG_ERROR("%s: nothing to encode\n", __func__); + return -1; } + clip_context & vctx = ctx->vctx; + auto & hparams = vctx.model->hparams; switch (hparams.mm_patch_merge_type) { case MM_PATCH_MERGE_FLAT: { // flat / default llava-1.5 type embedding // n_output = clip_n_patches(ctx); - int32_t encoded = clip_image_encode(ctx, img_res_v[0], output_embd); + int32_t encoded = clip_image_encode(vctx, *p); if (encoded != 0) { LLAMA_LOG_ERROR("Unable to encode image\n"); return encoded; @@ -700,44 +736,8 @@ static int32_t encode_image_with_clip(clip_context & ctx, const llama_img img, s return 0; } -//////////////////////////////////////////////////////////////////////////////////////// -// public API - -int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch) { - if (batch->n_imgs == 0) { - return 0; - } - - // TODO: batching is not working atm, should be fixed later - const int n_embd = clip_n_mmproj_embd(ctx); - const int n_tokens_per_img = clip_n_patches(ctx); - const int n_pos = n_tokens_per_img*batch->n_imgs; - - ctx.out_embd.resize(n_embd*n_pos); - ctx.out_pos.resize(n_pos); - - for (int i = 0; i < batch->n_imgs; i++) { - std::vector output_single; - int32_t status = encode_image_with_clip(ctx, *batch->imgs[i], output_single); - if (status != 0) { - return status; - } - // copy output embeddings to result - for (int k = 0; k < n_embd*n_tokens_per_img; k++) { - ctx.out_embd[n_embd*n_tokens_per_img*i + k] = output_single[k]; - } - // fill position for all output tokens - for (int p = 0; p < n_tokens_per_img; p++) { - ctx.out_pos[n_tokens_per_img*i + p] = batch->pos[i] + p; - } - } - - return 0; -} - -void llama_vision_clear_output(clip_context & ctx) { - ctx.out_embd.clear(); - ctx.out_pos.clear(); +struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx) { + return ctx->vctx.output; } //////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/llama-vision.h b/src/llama-vision.h index d7c922d99ff26..56c6b49c96ed9 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -2,15 +2,11 @@ #include "ggml.h" #include "llama.h" +#include "llama-arch.h" #include #include -enum vision_arch { - VISION_ARCH_UNKNOWN, - VISION_ARCH_LLAVA, -}; - enum clip_projector_type { CLIP_PROJECTOR_TYPE_UNKNOWN, CLIP_PROJECTOR_TYPE_MLP, @@ -50,72 +46,76 @@ struct clip_hparams { struct clip_layer { // attention - struct ggml_tensor * k_w = NULL; - struct ggml_tensor * k_b = NULL; - struct ggml_tensor * q_w = NULL; - struct ggml_tensor * q_b = NULL; - struct ggml_tensor * v_w = NULL; - struct ggml_tensor * v_b = NULL; + struct ggml_tensor * k_w = nullptr; + struct ggml_tensor * k_b = nullptr; 
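// A rough end-to-end sketch of how the new public vision API is intended to be driven from
// user code (hypothetical caller, not part of this patch; error handling and image decoding
// are assumed to happen elsewhere):
//
//   llama_vision_bitmap * bmp = llama_vision_bitmap_init(nx, ny);
//   memcpy(bmp->data, rgb_pixels, 3*nx*ny);                           // caller-supplied RGB data
//
//   llama_vision_patches * p = llama_vision_patches_init(lctx, bmp);  // preprocess into patches
//   if (llama_vision_encode(lctx, p) != 0) { /* handle error */ }     // run the CLIP encoder + projector
//
//   ggml_tensor * embd = llama_vision_get_output_tensor(lctx);        // one column per image token
//   llama_batch batch  = llama_batch_get_one_from_tensor(embd, /*p0 =*/ n_past, /*seq_id =*/ 0);
//   llama_decode(lctx, batch);                                        // feed the image tokens to the LM
//
//   llama_batch_free(batch);
//   llama_vision_patches_free(p);
//   llama_vision_bitmap_free(bmp);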
+ struct ggml_tensor * q_w = nullptr; + struct ggml_tensor * q_b = nullptr; + struct ggml_tensor * v_w = nullptr; + struct ggml_tensor * v_b = nullptr; - struct ggml_tensor * output_w = NULL; - struct ggml_tensor * output_b = NULL; + struct ggml_tensor * output_w = nullptr; + struct ggml_tensor * output_b = nullptr; // layernorm 1 - struct ggml_tensor * norm_in_w = NULL; - struct ggml_tensor * norm_in_b = NULL; + struct ggml_tensor * norm_in_w = nullptr; + struct ggml_tensor * norm_in_b = nullptr; // ff - struct ggml_tensor * ffn_up_w = NULL; - struct ggml_tensor * ffn_up_b = NULL; + struct ggml_tensor * ffn_up_w = nullptr; + struct ggml_tensor * ffn_up_b = nullptr; - struct ggml_tensor * ffn_down_w = NULL; - struct ggml_tensor * ffn_down_b = NULL; + struct ggml_tensor * ffn_down_w = nullptr; + struct ggml_tensor * ffn_down_b = nullptr; // layernorm 2 - struct ggml_tensor * norm_out_w = NULL; - struct ggml_tensor * norm_out_b = NULL; + struct ggml_tensor * norm_out_w = nullptr; + struct ggml_tensor * norm_out_b = nullptr; }; struct clip_vision_model { struct clip_hparams hparams; + ggml_backend_buffer_type_t buft; // embeddings - struct ggml_tensor * class_embedding = NULL; - struct ggml_tensor * patch_embeddings = NULL; - struct ggml_tensor * patch_bias = NULL; - struct ggml_tensor * position_embeddings = NULL; + struct ggml_tensor * class_embedding = nullptr; + struct ggml_tensor * patch_embeddings = nullptr; + struct ggml_tensor * patch_bias = nullptr; + struct ggml_tensor * position_embeddings = nullptr; - struct ggml_tensor * pre_norm_w = NULL; - struct ggml_tensor * pre_norm_b = NULL; + struct ggml_tensor * pre_norm_w = nullptr; + struct ggml_tensor * pre_norm_b = nullptr; std::vector layers; - struct ggml_tensor * post_norm_w = NULL; - struct ggml_tensor * post_norm_b = NULL; + struct ggml_tensor * post_norm_w = nullptr; + struct ggml_tensor * post_norm_b = nullptr; - struct ggml_tensor * projection = NULL; + struct ggml_tensor * projection = nullptr; // LLaVA projection - struct ggml_tensor * mm_1_w = NULL; - struct ggml_tensor * mm_1_b = NULL; - struct ggml_tensor * mm_2_w = NULL; - struct ggml_tensor * mm_2_b = NULL; + struct ggml_tensor * mm_1_w = nullptr; + struct ggml_tensor * mm_1_b = nullptr; + struct ggml_tensor * mm_2_w = nullptr; + struct ggml_tensor * mm_2_b = nullptr; - struct ggml_tensor * image_newline = NULL; + struct ggml_tensor * image_newline = nullptr; }; struct clip_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; ggml_backend_sched_t sched = nullptr; + struct ggml_context * ctx_ggml = nullptr; const clip_vision_model * model; // temporary output data, to be picked up by llama_decode() - std::vector out_embd; // size == n_tokens * n_embd - std::vector out_pos; // position of each token + struct ggml_tensor * output; }; +// for now, this only contains: +// - the instruction for ggml_conv_2d to break the image into patches +// - the pre-processed image data in f32 struct llama_vision_patches { uint32_t px; // size of patch uint32_t py; // size of patch @@ -126,7 +126,7 @@ struct llama_vision_patches { std::vector> buf; // preprocessed image data }; -mm_patch_merge mm_patch_merge_from_name(std::string & name) { +inline mm_patch_merge mm_patch_merge_from_name(std::string & name) { if (name == "flat") { return MM_PATCH_MERGE_FLAT; } else if (name == "spatial_unpad") { @@ -135,17 +135,14 @@ mm_patch_merge mm_patch_merge_from_name(std::string & name) { return MM_PATCH_MERGE_UNKNOWN; } -clip_projector_type 
clip_projector_type_from_name(std::string & name) { +inline clip_projector_type clip_projector_type_from_name(std::string & name) { if (name == "mlp") { return CLIP_PROJECTOR_TYPE_MLP; } return CLIP_PROJECTOR_TYPE_UNKNOWN; } -llama_vision_patches * llama_vision_patches_init(llama_vision_bitmap * bmp); -void llama_vision_patches_free(llama_vision_patches * p); - -int32_t llama_vision_encode_impl(clip_context & ctx, llama_vision_patches * p); +// only for sanity check: must be equal to n_embd of language model +uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model); -// dimension of the output embeddings, must be equal to n_embd of language model -int clip_n_mmproj_embd(const clip_context & ctx); +struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index e8cfe5012819c..6170a655a276a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -138,6 +138,9 @@ static struct ggml_tensor * llm_build_inp_embd( ), scale); inpL = ggml_add(ctx, inpL, inpL_delta); } + } else if (ubatch.embd_tensor) { + inpL = ubatch.embd_tensor; + ggml_set_input(ubatch.embd_tensor); } else { lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); inpL = lctx.inp_embd; @@ -8466,7 +8469,9 @@ static int llama_decode_impl( const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + GGML_ASSERT((batch.token && !batch.embd && !batch.embd_tensor) + || (!batch.token && batch.embd && !batch.embd_tensor) + || (!batch.token && !batch.embd && batch.embd_tensor)); if (batch.token) { for (uint32_t i = 0; i < n_tokens_all; ++i) { @@ -9232,7 +9237,7 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) { uint32_t n_seqs = 1; // TODO: worst-case number of sequences uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); // initialize scheduler with the worst-case graph @@ -9785,7 +9790,7 @@ struct llama_context * llama_init_from_model( uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true); // reserve pp graph first so that buffers are only allocated once @@ -9794,7 +9799,7 @@ struct llama_context * llama_init_from_model( int n_nodes_pp = ggml_graph_n_nodes(gf_pp); // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; ggml_cgraph * gf_tg = 
llama_build_graph(*ctx, ubatch_tg, true); ggml_backend_sched_reserve(ctx->sched.get(), gf_tg); int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get()); @@ -9832,6 +9837,13 @@ struct llama_context * llama_init_from_model( } } + if (model->has_vision) { + ctx->vctx.model = &model->clip; + ctx->vctx.sched = ctx->sched.get(); + const size_t max_nodes = 1024; + ctx->vctx.buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + } + return ctx; } From 6cabdda0df1a5d89255c3895dc74dfc0eb435048 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 18 Jan 2025 22:56:04 +0100 Subject: [PATCH 03/25] add back convert hf to gguf --- convert_hf_to_gguf.py | 70 +++++++++++++++++++++++++-- examples/server/server.cpp | 1 + gguf-py/gguf/constants.py | 86 ++++++++++++++++++++++++++++++++++ gguf-py/gguf/gguf_writer.py | 53 +++++++++++++++++++++ gguf-py/gguf/tensor_mapping.py | 58 +++++++++++++++++++++++ include/llama.h | 2 +- src/llama-vision.h | 2 +- 7 files changed, 266 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 95f11204332eb..9e36cad61131c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -17,6 +17,7 @@ from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast from itertools import chain +from transformers import AutoConfig import math import numpy as np import torch @@ -66,6 +67,12 @@ class Model: metadata_override: Path | None dir_model_card: Path + # for vision model + preprocessor_config: dict[str, Any] | None = None + vparams: dict[str, Any] | None = None + v_tensor_map: gguf.TensorNameMap + v_tensor_names: set[str] | None + # subclasses should define this! model_arch: gguf.MODEL_ARCH @@ -95,6 +102,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + self.preprocessor_config = self.load_preprocessor_config(self.dir_model) # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: @@ -210,9 +218,13 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) - if new_name is None: + new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes) + if new_name is not None: + return new_name + elif new_name_vision is not None: + return new_name_vision + else: raise ValueError(f"Can not map tensor {name!r}") - return new_name def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) @@ -466,7 +478,24 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] @staticmethod def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) + hparams = json.load(f) + if "text_config" in hparams: + text_config = hparams["text_config"] + # for example, llava-1.5-7b-hf misses the language model config, need to retrieve it via model ID + if "_name_or_path" in text_config: + text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict() + hparams = {**text_config, **hparams} + return hparams + + @staticmethod + def 
load_preprocessor_config(dir_model: Path): + # TODO: this varies vastly among models, need to handle more cases in the future + file_path = dir_model / "preprocessor_config.json" + if os.path.exists(file_path): + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) + else: + return None @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: @@ -1557,10 +1586,17 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if "vision_config" in self.hparams: + self.vparams = self.hparams["vision_config"] + if self.vparams is not None: + self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"]) + def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -1594,6 +1630,26 @@ def set_vocab(self): if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) + # For vision model + if self.vparams is not None and self.preprocessor_config is not None: + self.gguf_writer.add_vision_type("clip-vit") + self.gguf_writer.add_vision_image_size(self.vparams["image_size"]) + self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"]) + self.gguf_writer.add_vision_clip_architecture("llava") + self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"]) + self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"]) + self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"]) + self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"]) + self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"]) + self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) + self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 + self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) + # TODO: should not hardcode these, but they are currently missing from config.json + self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) + self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05) + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -1624,6 +1680,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + # For vision model + if name.startswith("language_model"): + name = name.replace("language_model.", "") + if "post_layernorm" in name: + return [] # skip post_layernorm + if name.endswith(("q_proj.weight", "q_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 64c0c4ef68f13..83aa946e2a64c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2949,6 +2949,7 @@ struct server_context { batch.n_seq_id + i, 
batch.seq_id + i, batch.logits + i, + nullptr, }; const int ret = llama_decode(ctx, batch_view); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8fe84df21ea20..411c89e7f5373 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -202,6 +202,9 @@ class Tokenizer: FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" + # Vision models + IMAGE_START_ID = "tokenizer.ggml.image_start_token_id" + IMAGE_END_ID = "tokenizer.ggml.image_end_token_id" # deprecated: PREFIX_ID = "tokenizer.ggml.prefix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id" @@ -211,6 +214,31 @@ class Adapter: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class Vision: + # only support vision.type = "clip-vit" for now + TYPE = "vision.type" + IMAGE_SIZE = "vision.image_size" + PATCH_SIZE = "vision.patch_size" + IMAGE_MEAN = "vision.image_mean" + IMAGE_STD = "vision.image_std" + + class Clip: + ARCHITECTURE = "vision.clip.architecture" + CONTEXT_LENGTH = "vision.clip.context_length" + EMBEDDING_LENGTH = "vision.clip.embedding_length" + BLOCK_COUNT = "vision.clip.block_count" + FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length" + PROJECTION_TYPE = "vision.clip.projection_type" + PROJECTION_DIM = "vision.clip.projection_dim" + USE_GELU = "vision.clip.use_gelu" + MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings" + MAX_SLICES = "vision.clip.max_slices" + PROJECTOR_TYPE = "vision.clip.projector_type" + SELECT_LAYER = "vision.clip.select_layer" + PATCH_MERGE_TYPE = "vision.clip.patch_merge_type" + HEAD_COUNT = "vision.clip.attention.head_count" + LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon" + # # recommended mapping of model tensor names for storage in gguf # @@ -279,6 +307,8 @@ class MODEL_ARCH(IntEnum): GRANITE_MOE = auto() CHAMELEON = auto() WAVTOKENIZER_DEC = auto() + # vision models + LLAVA_VISION = auto() class MODEL_TENSOR(IntEnum): @@ -390,6 +420,7 @@ class MODEL_TENSOR(IntEnum): ENC_OUTPUT_NORM = auto() CLS = auto() # classifier CLS_OUT = auto() # classifier output projection + # wavtokenizer CONV1D = auto() CONVNEXT_DW = auto() CONVNEXT_NORM = auto() @@ -406,6 +437,21 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() + # vision + V_MMPROJ = auto() + V_ENC_EMBD_CLS = auto() + V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_POS = auto() + V_ENC_ATTN_Q = auto() + V_ENC_ATTN_K = auto() + V_ENC_ATTN_V = auto() + V_ENC_INPUT_NORM = auto() + V_ENC_OUTPUT = auto() + V_ENC_OUTPUT_NORM = auto() + V_ENC_FFN_UP = auto() + V_ENC_FFN_DOWN = auto() + V_PRE_NORM = auto() + V_POST_NORM = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -593,6 +639,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + # vision + MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}", + MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls", + MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.enc.embd.patch", + MODEL_TENSOR.V_ENC_EMBD_POS: "v.enc.embd.pos", + MODEL_TENSOR.V_ENC_ATTN_Q: "v.enc.blk.{bid}.attn_q", + MODEL_TENSOR.V_ENC_ATTN_K: "v.enc.blk.{bid}.attn_k", + MODEL_TENSOR.V_ENC_ATTN_V: "v.enc.blk.{bid}.attn_v", + MODEL_TENSOR.V_ENC_INPUT_NORM: "v.enc.blk.{bid}.input_norm", + MODEL_TENSOR.V_ENC_OUTPUT: "v.enc.blk.{bid}.output", + MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.enc.blk.{bid}.output_norm", + MODEL_TENSOR.V_ENC_FFN_UP: 
"v.enc.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down", + MODEL_TENSOR.V_PRE_NORM: "v.pre_norm", + MODEL_TENSOR.V_POST_NORM: "v.post_norm", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1534,6 +1595,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_V, MODEL_TENSOR.POSNET_ATTN_OUT, ], + MODEL_ARCH.LLAVA_VISION: [ + MODEL_TENSOR.V_MMPROJ, + MODEL_TENSOR.V_ENC_EMBD_CLS, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_PRE_NORM, + MODEL_TENSOR.V_POST_NORM, + ], # TODO } @@ -1615,6 +1692,15 @@ class PoolingType(IntEnum): CLS = 2 +class CLIPProjectorType(Enum): + MLP = 'mlp' + + +class CLIPPatchMergeType(Enum): + FLAT = 'flat' + SPATIAL_UNPAD = 'spatial_unpad' + + class GGMLQuantizationType(IntEnum): F32 = 0 F16 = 1 diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 080d2b9dce5cb..5438acd06132b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -27,6 +27,8 @@ PoolingType, TokenType, ExpertGatingFuncType, + CLIPPatchMergeType, + CLIPProjectorType, ) from .quants import quant_shape_from_byte_shape @@ -874,6 +876,57 @@ def add_remove_extra_whitespaces(self, value: bool) -> None: def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None: self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap) + + def add_vision_type(self, value: str) -> None: + self.add_string(Keys.Vision.TYPE, value) + + def add_vision_image_size(self, value: int) -> None: + self.add_uint32(Keys.Vision.IMAGE_SIZE, value) + + def add_vision_patch_size(self, value: int) -> None: + self.add_uint32(Keys.Vision.PATCH_SIZE, value) + + def add_vision_clip_architecture(self, value: str) -> None: + self.add_string(Keys.Vision.Clip.ARCHITECTURE, value) + + def add_vision_clip_context_length(self, value: int) -> None: + self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value) + + def add_vision_clip_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value) + + def add_vision_clip_block_count(self, value: int) -> None: + self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value) + + def add_vision_clip_feed_forward_length(self, value: int) -> None: + self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value) + + def add_vision_clip_head_count(self, value: int) -> None: + self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value) + + def add_vision_clip_max_position_embeddings(self, value: int) -> None: + self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value) + + def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None: + self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value) + + def add_vision_clip_max_slices(self, value: int) -> None: + self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value) + + def add_vision_clip_select_layer(self, value: int) -> None: + self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value) + + def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None: + self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value) + + def add_vision_clip_layer_norm_epsilon(self, value: float) -> None: + self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value) + + def add_vision_clip_image_mean(self, value: Sequence[float]) -> None: + 
self.add_array(Keys.Vision.IMAGE_MEAN, value) + + def add_vision_clip_image_std(self, value: Sequence[float]) -> None: + self.add_array(Keys.Vision.IMAGE_STD, value) def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: if not isinstance(value, str): diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 617791e240b60..813f8f7e052ce 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -787,6 +787,64 @@ class TensorNameMap: MODEL_TENSOR.POSNET_ATTN_OUT: ( "backbone.posnet.{bid}.proj_out", # wavtokenizer ), + + ############################################################################# + + MODEL_TENSOR.V_MMPROJ: ( + "multi_modal_projector.linear_{bid}", + ), + + MODEL_TENSOR.V_ENC_EMBD_CLS: ( + "vision_tower.vision_model.embeddings.class_embedding", + ), + + MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "vision_tower.vision_model.embeddings.patch_embedding", + ), + + MODEL_TENSOR.V_ENC_EMBD_POS: ( + "vision_tower.vision_model.embeddings.position_embedding", + ), + + MODEL_TENSOR.V_ENC_ATTN_Q: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + ), + + MODEL_TENSOR.V_ENC_ATTN_K: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + ), + + MODEL_TENSOR.V_ENC_ATTN_V: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + ), + + MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", + ), + + MODEL_TENSOR.V_ENC_OUTPUT: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", + ), + + MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", + ), + + MODEL_TENSOR.V_ENC_FFN_UP: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + ), + + MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + ), + + MODEL_TENSOR.V_PRE_NORM: ( + "vision_tower.vision_model.pre_layrnorm", + ), + + MODEL_TENSOR.V_POST_NORM: ( + "vision_tower.vision_model.post_layernorm", + ), } # architecture-specific block mappings diff --git a/include/llama.h b/include/llama.h index 5013e96e78825..bd8e696585693 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1292,7 +1292,7 @@ extern "C" { // Encode patches into embeddings LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_patches * p); - LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx); + LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_context * ctx); // // Model split diff --git a/src/llama-vision.h b/src/llama-vision.h index 56c6b49c96ed9..ced58dd0b88ca 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -40,7 +40,7 @@ struct clip_hparams { std::array image_mean; std::array image_std; - std::array image_grid_pinpoints; + std::array image_grid_pinpoints; // TODO: should this be array of (x, y) pairs? 
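// As a concrete example of the naming convention that ties the converter and the loader
// together (illustrative, layer 0 of a CLIP tower): the HF tensor
//   vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight
// is written to GGUF as
//   v.enc.blk.0.attn_q.weight
// and on the C++ side tn(VISION_TENSOR_ENC_ATTN_Q, "weight", 0) resolves to the same string
// when the vision tensors are created in llama_model::load_tensors().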
int32_t image_crop_resolution; }; From d0068ef0eda5f43c65258dd2eefda5bacea412fb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 19 Jan 2025 16:29:20 +0100 Subject: [PATCH 04/25] add mobilevlm --- convert_hf_to_gguf.py | 66 ++++++++++++++++------- gguf-py/gguf/constants.py | 34 ++++++++++-- gguf-py/gguf/gguf_writer.py | 2 +- gguf-py/gguf/tensor_mapping.py | 8 +++ src/llama-arch.cpp | 31 ++++++++++- src/llama-arch.h | 3 ++ src/llama-model.cpp | 99 ++++++++++++++++++++-------------- src/llama-vision.cpp | 29 +++++++++- src/llama-vision.h | 11 ++++ 9 files changed, 216 insertions(+), 67 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9e36cad61131c..89e62f5cefb37 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast from itertools import chain -from transformers import AutoConfig +from transformers import AutoConfig, AutoImageProcessor import math import numpy as np import torch @@ -68,9 +68,10 @@ class Model: dir_model_card: Path # for vision model + vision_arch: gguf.MODEL_ARCH | None = None preprocessor_config: dict[str, Any] | None = None vparams: dict[str, Any] | None = None - v_tensor_map: gguf.TensorNameMap + v_tensor_map: gguf.TensorNameMap | None = None v_tensor_names: set[str] | None # subclasses should define this! @@ -102,7 +103,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py - self.preprocessor_config = self.load_preprocessor_config(self.dir_model) # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: @@ -218,7 +218,7 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) - new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes) + new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes) if self.v_tensor_map is not None else None if new_name is not None: return new_name elif new_name_vision is not None: @@ -488,14 +488,17 @@ def load_hparams(dir_model: Path): return hparams @staticmethod - def load_preprocessor_config(dir_model: Path): + def load_preprocessor_config(dir_or_model_id: Path | str): # TODO: this varies vastly among models, need to handle more cases in the future - file_path = dir_model / "preprocessor_config.json" - if os.path.exists(file_path): - with open(file_path, "r", encoding="utf-8") as f: - return json.load(f) + if isinstance(dir_or_model_id, Path): + file_path = dir_or_model_id / "preprocessor_config.json" + if os.path.exists(file_path): + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) + else: + raise Exception(f"Preprocessor config not found at {file_path}") else: - return None + return AutoImageProcessor.from_pretrained(dir_or_model_id).to_dict() @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: @@ -1586,16 +1589,31 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", 
"LlavaForConditionalGeneration") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if "vision_config" in self.hparams: + + model_type = self.hparams.get("model_type", None) + self.vision_arch = None + + # only tested with https://huggingface.co/llava-hf/llava-1.5-7b-hf + if "vision_config" in self.hparams and model_type == "llava": self.vparams = self.hparams["vision_config"] - if self.vparams is not None: - self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"]) + self.preprocessor_config = self.load_preprocessor_config(self.dir_model) + self.vision_arch = gguf.MODEL_ARCH.VISION_LLAVA + + # only tested with https://huggingface.co/mtgv/MobileVLM_V2-1.7B + if "mm_vision_tower" in self.hparams and model_type == "mobilevlm": + vision_model_id = self.hparams["mm_vision_tower"] + self.vparams = AutoConfig.from_pretrained(vision_model_id).to_dict()["vision_config"] + self.preprocessor_config = self.load_preprocessor_config(vision_model_id) + self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM + + if self.vparams is not None and self.vision_arch is not None: + self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) def set_vocab(self): try: @@ -1631,23 +1649,31 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(False) # For vision model - if self.vparams is not None and self.preprocessor_config is not None: + if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None: self.gguf_writer.add_vision_type("clip-vit") self.gguf_writer.add_vision_image_size(self.vparams["image_size"]) self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"]) - self.gguf_writer.add_vision_clip_architecture("llava") + self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch]) self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"]) self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"]) self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"]) self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"]) self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"]) self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"]) - self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) + if "vision_feature_layer" in self.hparams: + self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) + elif "mm_vision_select_layer" in self.hparams: + self.gguf_writer.add_vision_clip_select_layer(self.hparams["mm_vision_select_layer"]) + else: + raise ValueError("gguf: can not find vision_feature_layer parameter.") # TODO: should not hardcode these, but they are currently missing from config.json - self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) + if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA: + 
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) + if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM: + self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2) self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05) def set_gguf_parameters(self): @@ -1683,6 +1709,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # For vision model if name.startswith("language_model"): name = name.replace("language_model.", "") + else: + name = name.replace("model.vision_tower.", "") if "post_layernorm" in name: return [] # skip post_layernorm @@ -2101,7 +2129,7 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: return n_dims > 1 -@Model.register("MiniCPMForCausalLM") +@Model.register("MiniCPMForCausalLM", "MiniCPMV") class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 411c89e7f5373..7007ecfd860de 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -308,7 +308,8 @@ class MODEL_ARCH(IntEnum): CHAMELEON = auto() WAVTOKENIZER_DEC = auto() # vision models - LLAVA_VISION = auto() + VISION_LLAVA = auto() + VISION_MOBILEVLM = auto() class MODEL_TENSOR(IntEnum): @@ -439,6 +440,8 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_OUT = auto() # vision V_MMPROJ = auto() + V_MMPROJ_MLP = auto() + V_MMPROJ_PEG = auto() V_ENC_EMBD_CLS = auto() V_ENC_EMBD_PATCH = auto() V_ENC_EMBD_POS = auto() @@ -512,6 +515,9 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GRANITE_MOE: "granitemoe", MODEL_ARCH.CHAMELEON: "chameleon", MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", + # vision + MODEL_ARCH.VISION_LLAVA: "llava", + MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -641,6 +647,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", # vision MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}", + MODEL_TENSOR.V_MMPROJ_MLP: "v.mmproj.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_PEG: "v.mmproj.peg.{bid}", MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls", MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.enc.embd.patch", MODEL_TENSOR.V_ENC_EMBD_POS: "v.enc.embd.pos", @@ -1595,7 +1603,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_V, MODEL_TENSOR.POSNET_ATTN_OUT, ], - MODEL_ARCH.LLAVA_VISION: [ + MODEL_ARCH.VISION_LLAVA: [ MODEL_TENSOR.V_MMPROJ, MODEL_TENSOR.V_ENC_EMBD_CLS, MODEL_TENSOR.V_ENC_EMBD_PATCH, @@ -1611,6 +1619,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_PRE_NORM, MODEL_TENSOR.V_POST_NORM, ], + MODEL_ARCH.VISION_MOBILEVLM: [ + MODEL_TENSOR.V_MMPROJ_MLP, + MODEL_TENSOR.V_MMPROJ_PEG, + MODEL_TENSOR.V_ENC_EMBD_CLS, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_PRE_NORM, + MODEL_TENSOR.V_POST_NORM, + ], # TODO } @@ -1693,11 +1718,12 @@ class PoolingType(IntEnum): class CLIPProjectorType(Enum): - MLP = 'mlp' + MLP = 'mlp' + LDPV2 = 'ldpv2' class CLIPPatchMergeType(Enum): - FLAT = 'flat' + FLAT = 'flat' SPATIAL_UNPAD = 'spatial_unpad' diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 5438acd06132b..4b9a0c9662dfe 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -876,7 +876,7 @@ def 
add_remove_extra_whitespaces(self, value: bool) -> None: def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None: self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap) - + def add_vision_type(self, value: str) -> None: self.add_string(Keys.Vision.TYPE, value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 813f8f7e052ce..f7ff9a032f341 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -794,6 +794,14 @@ class TensorNameMap: "multi_modal_projector.linear_{bid}", ), + MODEL_TENSOR.V_MMPROJ_MLP: ( + "model.mm_projector.mlp.mlp.{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_PEG: ( + "model.mm_projector.peg.peg.{bid}", + ), + MODEL_TENSOR.V_ENC_EMBD_CLS: ( "vision_tower.vision_model.embeddings.class_embedding", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index dcfbdab3e8816..b474e07507bbd 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -67,6 +67,7 @@ static const std::map LLM_ARCH_NAMES = { static const std::map VISION_ARCH_NAMES = { { VISION_ARCH_LLAVA, "llava" }, + { VISION_ARCH_MOBILEVLM, "mobilevlm" }, { VISION_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1345,7 +1346,27 @@ static const std::map> VISION { VISION_TENSOR_PRE_NORM, "v.pre_norm" }, { VISION_TENSOR_POST_NORM, "v.post_norm" }, } - } + }, + { + VISION_ARCH_MOBILEVLM, + { + { VISION_TENSOR_MMPROJ_MLP, "v.mmproj.mlp.%d" }, + { VISION_TENSOR_MMPROJ_PEG, "v.mmproj.peg.%d" }, + { VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" }, + { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" }, + { VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" }, + { VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, + { VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, + { VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, + { VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, + { VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" }, + { VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, + { VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, + { VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, + { VISION_TENSOR_PRE_NORM, "v.pre_norm" }, + { VISION_TENSOR_POST_NORM, "v.post_norm" }, + } + }, }; static const std::map LLM_TENSOR_INFOS = { @@ -1499,6 +1520,10 @@ std::string LLM_KV::operator()(llm_kv kv) const { template<> std::string BASE_TN_IMPL::str() const { + if (LLM_TENSOR_NAMES.find(arch) == LLM_TENSOR_NAMES.end()) { + throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch)); + } + if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } @@ -1515,6 +1540,10 @@ std::string BASE_TN_IMPL::str() const { template<> std::string BASE_TN_IMPL::str() const { + if (VISION_TENSOR_NAMES.find(arch) == VISION_TENSOR_NAMES.end()) { + throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch)); + } + if (VISION_TENSOR_NAMES.at(arch).find(tensor) == VISION_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } diff --git a/src/llama-arch.h b/src/llama-arch.h index ce89b15f544c5..87966b11fe510 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -72,6 +72,7 @@ enum llm_arch { enum vision_arch { VISION_ARCH_UNKNOWN, VISION_ARCH_LLAVA, + VISION_ARCH_MOBILEVLM, }; enum llm_kv { @@ -356,6 +357,8 @@ enum llm_tensor { enum vision_tensor { VISION_TENSOR_MMPROJ, + VISION_TENSOR_MMPROJ_MLP, + VISION_TENSOR_MMPROJ_PEG, VISION_TENSOR_ENC_EMBD_CLS, VISION_TENSOR_ENC_EMBD_PATCH, VISION_TENSOR_ENC_EMBD_POS, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 
42cc230ced973..cd669744f15c4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1280,6 +1280,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::string arch; ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true); vparams.arch = vision_arch_from_string(arch); + if (vparams.arch == VISION_ARCH_UNKNOWN) { + throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str())); + } } } else if (!vision_type.empty()) { throw std::runtime_error(format("unsupported vision type: %s", vision_type.c_str())); @@ -1288,6 +1291,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // arch-specific CLIP hparams switch (vparams.arch) { case VISION_ARCH_LLAVA: + case VISION_ARCH_MOBILEVLM: { ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true); } break; @@ -3410,58 +3414,71 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // load tensors for vision model auto & vparams = clip.hparams; if (has_vision) { - const int64_t n_layer = vparams.n_layer; - const int64_t n_embd = vparams.hidden_size; - const int64_t n_ff = vparams.n_intermediate; - const int64_t max_pos_embd = vparams.max_pos_embd; - const int64_t n_channel = 3; // always RGB - const int64_t patch_size = vparams.patch_size; + // language params + const int64_t n_embd = hparams.n_embd; + // vision params + const int64_t n_vlayer = vparams.n_layer; + const int64_t n_vembd = vparams.hidden_size; + const int64_t n_vff = vparams.n_intermediate; + const int64_t max_pos_embd = vparams.max_pos_embd; + const int64_t n_channel = 3; // always RGB + const int64_t patch_size = vparams.patch_size; const auto tn = VISION_TN(vparams.arch); // clip is CPU-only for now clip.buft = ggml_backend_cpu_buffer_type(); ggml_context * ctx_vision = ctx_map.at(clip.buft); - clip.layers.resize(n_layer); + clip.layers.resize(n_vlayer); switch (vparams.arch) { case VISION_ARCH_LLAVA: + case VISION_ARCH_MOBILEVLM: { - clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff}); - clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff}); - clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff}); - clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff}); - - clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_embd}); - clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd}); - clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_embd, max_pos_embd}); - - clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd}); - clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd}); - clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); - - for (int i = 0; i < n_layer; ++i) { + if (vparams.arch == VISION_ARCH_LLAVA) { + clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_vembd, n_vff}); + clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_vff}); + clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_vff, n_vff}); + 
clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_vff}); + } else if (vparams.arch == VISION_ARCH_MOBILEVLM) { + clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd}); + clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 0), {n_embd}); + clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 2), {n_embd, n_embd}); + clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 2), {n_embd}); + clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd}); + clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "bias", 0), {n_embd}); + } + + clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_vembd}); + clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); + clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + + clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_vembd}); + clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_vembd}); + clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + + for (int i = 0; i < n_vlayer; ++i) { auto & layer = clip.layers[i]; - layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd}); - layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_embd}); - layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd}); - layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_embd}); - layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd}); - layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_embd}); - - layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}); - layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_ff}); - layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_ff, n_embd}); - layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_embd}); - - layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_embd}); - layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_embd}); - layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_embd}); - layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_embd}); - - layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_embd, n_embd}); - layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_embd}); + layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}); + 
layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_vembd}); + layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}); + layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_vembd}); + layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}); + layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_vembd}); + + layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}); + layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_vff}); + layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}); + layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_vembd}); + + layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd}); + layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_vembd}); + layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd}); + layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}); + + layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}); + layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_vembd}); } } break; default: diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index b419627e64c44..9b78ec1a6099e 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -58,8 +58,11 @@ static int clip_n_patches(const clip_context & ctx) { } uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) { - if (clip_model.hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) { + auto & proj_type = clip_model.hparams.proj_type; + if (proj_type == CLIP_PROJECTOR_TYPE_MLP) { return clip_model.mm_2_b->ne[0]; + } else if (proj_type == CLIP_PROJECTOR_TYPE_LDPV2) { + return clip_model.mm_model_peg_0_b->ne[0]; } else { GGML_ASSERT(false && "invalid proj type"); } @@ -559,6 +562,30 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, embeddings = ggml_gelu(ctx0, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + + } else if (hparams.proj_type == CLIP_PROJECTOR_TYPE_LDPV2) { + int n_patch = 24; + struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = 
ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } else { GGML_ASSERT(false && "unsupported proj type"); } diff --git a/src/llama-vision.h b/src/llama-vision.h index ced58dd0b88ca..5401cb51a5160 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -10,6 +10,7 @@ enum clip_projector_type { CLIP_PROJECTOR_TYPE_UNKNOWN, CLIP_PROJECTOR_TYPE_MLP, + CLIP_PROJECTOR_TYPE_LDPV2, }; enum mm_patch_merge { @@ -98,6 +99,14 @@ struct clip_vision_model { struct ggml_tensor * mm_2_w = nullptr; struct ggml_tensor * mm_2_b = nullptr; + // MobileVLM_V2 projection + struct ggml_tensor * mm_model_mlp_0_w = nullptr; + struct ggml_tensor * mm_model_mlp_0_b = nullptr; + struct ggml_tensor * mm_model_mlp_2_w = nullptr; + struct ggml_tensor * mm_model_mlp_2_b = nullptr; + struct ggml_tensor * mm_model_peg_0_w = nullptr; + struct ggml_tensor * mm_model_peg_0_b = nullptr; + struct ggml_tensor * image_newline = nullptr; }; @@ -138,6 +147,8 @@ inline mm_patch_merge mm_patch_merge_from_name(std::string & name) { inline clip_projector_type clip_projector_type_from_name(std::string & name) { if (name == "mlp") { return CLIP_PROJECTOR_TYPE_MLP; + } else if (name == "ldpv2") { + return CLIP_PROJECTOR_TYPE_LDPV2; } return CLIP_PROJECTOR_TYPE_UNKNOWN; } From 4a7ab89d7593ccb89f80e6e118875ee0b3ede3c7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 19 Jan 2025 22:33:05 +0100 Subject: [PATCH 05/25] wip minicpmv --- convert_hf_to_gguf.py | 160 ++++++++++++++++++++--------- gguf-py/gguf/constants.py | 46 ++++++++- gguf-py/gguf/tensor_mapping.py | 46 +++++++++ src/llama-arch.cpp | 41 +++++--- src/llama-arch.h | 11 ++ src/llama-hparams.h | 2 +- src/llama-model.cpp | 45 +++++++-- src/llama-vision.cpp | 177 ++++++++++++++++++++++++++++++++- src/llama-vision.h | 40 +++++++- 9 files changed, 491 insertions(+), 77 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 89e62f5cefb37..bf6ffb49c66e1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast from itertools import chain -from transformers import AutoConfig, AutoImageProcessor +from transformers import AutoConfig import math import numpy as np import torch @@ -134,6 +134,16 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return None raise KeyError(f"could not find any of: {keys}") + def find_vparams(self, keys: Iterable[str], optional: bool = False) -> Any: + if self.vparams is None: + raise ValueError("vision model parameters not set") + key = next((k for k in keys if k in self.vparams), None) + if key is not None: + return self.vparams[key] + if optional: + return None + raise KeyError(f"(vision) could not find any of: {keys}") + def set_vocab(self): self._set_vocab_gpt2() @@ -269,6 +279,20 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length(head_dim) self.gguf_writer.add_value_length(head_dim) + # Vision model parameters + if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None: + self.gguf_writer.add_vision_type("clip-vit") + self.gguf_writer.add_vision_image_size(self.vparams["image_size"]) + self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"]) + 
self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch]) + self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"]) + self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"]) + self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"]) + self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"]) + self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"]) + self.gguf_writer.add_vision_clip_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"])) + self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") @@ -488,17 +512,14 @@ def load_hparams(dir_model: Path): return hparams @staticmethod - def load_preprocessor_config(dir_or_model_id: Path | str): + def load_preprocessor_config(dir_model: Path): # TODO: this varies vastly among models, need to handle more cases in the future - if isinstance(dir_or_model_id, Path): - file_path = dir_or_model_id / "preprocessor_config.json" - if os.path.exists(file_path): - with open(file_path, "r", encoding="utf-8") as f: - return json.load(f) - else: - raise Exception(f"Preprocessor config not found at {file_path}") + file_path = dir_model / "preprocessor_config.json" + if os.path.exists(file_path): + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) else: - return AutoImageProcessor.from_pretrained(dir_or_model_id).to_dict() + raise Exception(f"Preprocessor config not found at {file_path}") @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: @@ -551,7 +572,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + # DEBIAN_FRONTEND=noninteractive means that the script is running in a non-interactive environment (i.e. 
CI), so we cannot answer Y/N when it asks for user input + is_cli_non_interactive = os.environ.get("DEBIAN_FRONTEND", "") == "noninteractive" + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=is_cli_non_interactive) vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size @@ -1607,9 +1630,10 @@ def __init__(self, *args, **kwargs): # only tested with https://huggingface.co/mtgv/MobileVLM_V2-1.7B if "mm_vision_tower" in self.hparams and model_type == "mobilevlm": + from transformers import AutoImageProcessor vision_model_id = self.hparams["mm_vision_tower"] self.vparams = AutoConfig.from_pretrained(vision_model_id).to_dict()["vision_config"] - self.preprocessor_config = self.load_preprocessor_config(vision_model_id) + self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict() self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM if self.vparams is not None and self.vision_arch is not None: @@ -1648,34 +1672,6 @@ def set_vocab(self): if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) - # For vision model - if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None: - self.gguf_writer.add_vision_type("clip-vit") - self.gguf_writer.add_vision_image_size(self.vparams["image_size"]) - self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"]) - self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch]) - self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"]) - self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"]) - self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"]) - self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"]) - self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"]) - self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"]) - self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) - max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 - self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) - if "vision_feature_layer" in self.hparams: - self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) - elif "mm_vision_select_layer" in self.hparams: - self.gguf_writer.add_vision_clip_select_layer(self.hparams["mm_vision_select_layer"]) - else: - raise ValueError("gguf: can not find vision_feature_layer parameter.") - # TODO: should not hardcode these, but they are currently missing from config.json - if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA: - self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) - if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM: - self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2) - self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05) - def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -1692,6 +1688,18 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + # For vision model + if self.vparams is not None: + self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) + # TODO: should not hardcode these, but they 
are currently missing from config.json + if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA: + self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) + if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM: + self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2) + self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05) + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 + self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) + @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: @@ -2132,16 +2140,50 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: @Model.register("MiniCPMForCausalLM", "MiniCPMV") class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM + proj_type: gguf.constants.CLIPProjectorType | None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + model_type = self.hparams.get("model_type", None) + + # only tested with https://huggingface.co/openbmb/MiniCPM-V-2_6 + if "vision_config" in self.hparams and model_type == "minicpmv": + self.vparams = self.hparams["vision_config"] + self.preprocessor_config = self.load_preprocessor_config(self.dir_model) + self.vision_arch = gguf.MODEL_ARCH.VISION_MINICPMV + version = str(self.hparams.get("version", "unknown")) + if version == "2.5": + self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_5 + elif version == "2.6": + self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6 + else: + raise ValueError(f"Unsupported MiniCPM-V version: {version}") + + if self.vparams is not None and self.vision_arch is not None and self.preprocessor_config is not None: + self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5] + self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5] + self.hparams["vision_feature_layer"] = 0 + self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) def set_gguf_parameters(self): super().set_gguf_parameters() - embedding_scale = float(self.hparams["scale_emb"]) + # scale_emb + embedding_scale = float(self.hparams.get("scale_emb", 1.0)) self.gguf_writer.add_embedding_scale(embedding_scale) logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") - residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 + # scale_depth + if "scale_depth" in self.hparams: + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 + else: + residual_scale = 1.0 self.gguf_writer.add_residual_scale(residual_scale) logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") - logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + # logit_scale + if "dim_model_base" in self.hparams: + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + else: + logit_scale = 1.0 self.gguf_writer.add_logit_scale(logit_scale) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") if self.hparams.get("rope_scaling") is not None: @@ -2149,6 +2191,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") + # For vision model + if self.vparams is not None and self.proj_type is not None: + self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) + 
self.gguf_writer.add_vision_clip_projector_type(self.proj_type) + self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-06) + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] @@ -2167,18 +2218,33 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) def set_vocab(self): - self._set_vocab_sentencepiece() + if self.vision_arch == gguf.MODEL_ARCH.VISION_MINICPMV: + # undocumented anywhere, I only found this thanks to https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf + self._set_vocab_gpt2() + else: + self._set_vocab_sentencepiece() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + # For vision model + if name.startswith("llm."): + name = name.replace("llm.", "") + # attention, someone mess up and use underscore instead of dot + if name.endswith("in_proj_weight"): + name = name.replace("_weight", ".weight") + if name.endswith("in_proj_bias"): + name = name.replace("_bias", ".bias") + if "post_layernorm" in name: + return [] # skip post_layernorm + n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): + if not name.startswith("vpm") and name.endswith(("q_proj.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): + if not name.startswith("vpm") and name.endswith(("k_proj.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] @@ -5064,7 +5130,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file") + description="Convert a huggingface model to a GGML compatible file\n\nNote: When converting vision models, this script may use internet connection to download configuration files via Hugging Face.") parser.add_argument( "--vocab-only", action="store_true", help="extract only the vocab", diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7007ecfd860de..bd7befed207ed 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -310,6 +310,7 @@ class MODEL_ARCH(IntEnum): # vision models VISION_LLAVA = auto() VISION_MOBILEVLM = auto() + VISION_MINICPMV = auto() class MODEL_TENSOR(IntEnum): @@ -455,6 +456,15 @@ class MODEL_TENSOR(IntEnum): V_ENC_FFN_DOWN = auto() V_PRE_NORM = auto() V_POST_NORM = auto() + V_RESMPL_POS_EMBD_K = auto() # minicpmv + V_RESMPL_ATTN_IN = auto() # minicpmv + V_RESMPL_ATTN_OUT = auto() # minicpmv + V_RESMPL_KV_PROJ = auto() # minicpmv + V_RESMPL_NORM_POST = auto() # minicpmv + V_RESMPL_NORM_KV = auto() # minicpmv + V_RESMPL_NORM_Q = auto() # minicpmv + V_RESMPL_PROJ = auto() # minicpmv + V_RESMPL_QUERY = auto() # minicpmv MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -518,6 +528,7 @@ class MODEL_TENSOR(IntEnum): # vision MODEL_ARCH.VISION_LLAVA: "llava", MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm", + MODEL_ARCH.VISION_MINICPMV: "minicpmv", } TENSOR_NAMES: dict[MODEL_TENSOR, 
str] = { @@ -662,6 +673,15 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down", MODEL_TENSOR.V_PRE_NORM: "v.pre_norm", MODEL_TENSOR.V_POST_NORM: "v.post_norm", + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "v.resmpl.pos_embd_k", + MODEL_TENSOR.V_RESMPL_ATTN_IN: "v.resmpl.attn_in", + MODEL_TENSOR.V_RESMPL_ATTN_OUT: "v.resmpl.attn_out", + MODEL_TENSOR.V_RESMPL_KV_PROJ: "v.resmpl.kv_proj", + MODEL_TENSOR.V_RESMPL_NORM_POST: "v.resmpl.norm_post", + MODEL_TENSOR.V_RESMPL_NORM_KV: "v.resmpl.norm_kv", + MODEL_TENSOR.V_RESMPL_NORM_Q: "v.resmpl.norm_q", + MODEL_TENSOR.V_RESMPL_PROJ: "v.resmpl.proj", + MODEL_TENSOR.V_RESMPL_QUERY: "v.resmpl.query", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1636,6 +1656,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_PRE_NORM, MODEL_TENSOR.V_POST_NORM, ], + MODEL_ARCH.VISION_MINICPMV: [ + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_RESMPL_ATTN_IN, + MODEL_TENSOR.V_RESMPL_ATTN_OUT, + MODEL_TENSOR.V_RESMPL_KV_PROJ, + MODEL_TENSOR.V_RESMPL_NORM_POST, + MODEL_TENSOR.V_RESMPL_NORM_KV, + MODEL_TENSOR.V_RESMPL_NORM_Q, + MODEL_TENSOR.V_RESMPL_PROJ, + MODEL_TENSOR.V_RESMPL_QUERY, + ], # TODO } @@ -1718,8 +1758,10 @@ class PoolingType(IntEnum): class CLIPProjectorType(Enum): - MLP = 'mlp' - LDPV2 = 'ldpv2' + MLP = 'mlp' + LDPV2 = 'ldpv2' + MINICPMV_2_5 = 'minicpmv-2.5' # resampler + MINICPMV_2_6 = 'minicpmv-2.6' # resampler class CLIPPatchMergeType(Enum): diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f7ff9a032f341..92e1d499ae0d9 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -808,42 +808,52 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_PATCH: ( "vision_tower.vision_model.embeddings.patch_embedding", + "vpm.embeddings.patch_embedding", ), MODEL_TENSOR.V_ENC_EMBD_POS: ( "vision_tower.vision_model.embeddings.position_embedding", + "vpm.embeddings.position_embedding", ), MODEL_TENSOR.V_ENC_ATTN_Q: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + "vpm.encoder.layers.{bid}.self_attn.q_proj", ), MODEL_TENSOR.V_ENC_ATTN_K: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "vpm.encoder.layers.{bid}.self_attn.k_proj", ), MODEL_TENSOR.V_ENC_ATTN_V: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "vpm.encoder.layers.{bid}.self_attn.v_proj", ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", + "vpm.encoder.layers.{bid}.layer_norm1", ), MODEL_TENSOR.V_ENC_OUTPUT: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", + "vpm.encoder.layers.{bid}.self_attn.out_proj", ), MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", + "vpm.encoder.layers.{bid}.layer_norm2", ), MODEL_TENSOR.V_ENC_FFN_UP: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + "vpm.encoder.layers.{bid}.mlp.fc1", ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + "vpm.encoder.layers.{bid}.mlp.fc2", ), MODEL_TENSOR.V_PRE_NORM: ( @@ -853,6 +863,42 @@ class TensorNameMap: MODEL_TENSOR.V_POST_NORM: ( "vision_tower.vision_model.post_layernorm", ), + + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( + "resampler.pos_embed_k", + 
), + + MODEL_TENSOR.V_RESMPL_ATTN_IN: ( + "resampler.attn.in_proj", + ), + + MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( + "resampler.attn.out_proj", + ), + + MODEL_TENSOR.V_RESMPL_KV_PROJ: ( + "resampler.kv_proj", + ), + + MODEL_TENSOR.V_RESMPL_NORM_POST: ( + "resampler.ln_post", + ), + + MODEL_TENSOR.V_RESMPL_NORM_KV: ( + "resampler.ln_kv", + ), + + MODEL_TENSOR.V_RESMPL_NORM_Q: ( + "resampler.ln_q", + ), + + MODEL_TENSOR.V_RESMPL_PROJ: ( + "resampler.proj", + ), + + MODEL_TENSOR.V_RESMPL_QUERY: ( + "resampler.query", + ), } # architecture-specific block mappings diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b474e07507bbd..e2908c0ae0956 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -3,6 +3,7 @@ #include "llama-impl.h" #include +#include static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, @@ -65,12 +66,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_UNKNOWN, "(unknown)" }, }; -static const std::map VISION_ARCH_NAMES = { - { VISION_ARCH_LLAVA, "llava" }, - { VISION_ARCH_MOBILEVLM, "mobilevlm" }, - { VISION_ARCH_UNKNOWN, "(unknown)" }, -}; - static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_TYPE, "general.type" }, { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, @@ -1367,6 +1362,30 @@ static const std::map> VISION { VISION_TENSOR_POST_NORM, "v.post_norm" }, } }, + { + VISION_ARCH_MINICPMV, + { + { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" }, + { VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" }, + { VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, + { VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, + { VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, + { VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, + { VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" }, + { VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, + { VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, + { VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, + { VISION_TENSOR_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" }, + { VISION_TENSOR_RESMPL_ATTN_IN, "v.resmpl.attn_in" }, + { VISION_TENSOR_RESMPL_ATTN_OUT, "v.resmpl.attn_out" }, + { VISION_TENSOR_RESMPL_KV_PROJ, "v.resmpl.kv_proj" }, + { VISION_TENSOR_RESMPL_NORM_POST, "v.resmpl.norm_post" }, + { VISION_TENSOR_RESMPL_NORM_KV, "v.resmpl.norm_kv" }, + { VISION_TENSOR_RESMPL_NORM_Q, "v.resmpl.norm_q" }, + { VISION_TENSOR_RESMPL_PROJ, "v.resmpl.proj" }, + { VISION_TENSOR_RESMPL_QUERY, "v.resmpl.query" }, + } + }, }; static const std::map LLM_TENSOR_INFOS = { @@ -1576,16 +1595,6 @@ llm_arch llm_arch_from_string(const std::string & name) { return LLM_ARCH_UNKNOWN; } -vision_arch vision_arch_from_string(const std::string & name) { - for (const auto & kv : VISION_ARCH_NAMES) { // NOLINT - if (kv.second == name) { - return kv.first; - } - } - - return VISION_ARCH_UNKNOWN; -} - const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) { return LLM_TENSOR_INFOS.at(tensor); } diff --git a/src/llama-arch.h b/src/llama-arch.h index 87966b11fe510..7d4a1cd8c9567 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -73,6 +73,7 @@ enum vision_arch { VISION_ARCH_UNKNOWN, VISION_ARCH_LLAVA, VISION_ARCH_MOBILEVLM, + VISION_ARCH_MINICPMV, }; enum llm_kv { @@ -372,6 +373,16 @@ enum vision_tensor { VISION_TENSOR_ENC_FFN_DOWN, VISION_TENSOR_PRE_NORM, VISION_TENSOR_POST_NORM, + // minicpmv + VISION_TENSOR_RESMPL_POS_EMBD_K, + VISION_TENSOR_RESMPL_ATTN_IN, + VISION_TENSOR_RESMPL_ATTN_OUT, + VISION_TENSOR_RESMPL_KV_PROJ, + VISION_TENSOR_RESMPL_NORM_POST, + VISION_TENSOR_RESMPL_NORM_KV, + 
VISION_TENSOR_RESMPL_NORM_Q, + VISION_TENSOR_RESMPL_PROJ, + VISION_TENSOR_RESMPL_QUERY, }; enum llm_tensor_layer { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 1fe45410371b9..52df05e6ac71c 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -96,7 +96,7 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; - // Additional scale factors (Granite/Granite MoE) + // Additional scale factors (Granite/Granite MoE/MiniCPM) float f_residual_scale = 0.0f; float f_embedding_scale = 0.0f; float f_attention_scale = 0.0f; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index cd669744f15c4..d4d53aba69047 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2,6 +2,7 @@ #include "llama-impl.h" #include "llama-mmap.h" +#include "llama-vision.h" #include "llama-model-loader.h" #include "ggml-cpp.h" @@ -1263,6 +1264,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true); ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true); ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true); + ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true); { std::string name; ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true); @@ -1289,14 +1291,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { } // arch-specific CLIP hparams - switch (vparams.arch) { - case VISION_ARCH_LLAVA: - case VISION_ARCH_MOBILEVLM: - { - ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true); - } break; - default: (void)0; - } + // switch (vparams.arch) { + // case VISION_ARCH_LLAVA: + // default: (void)0; + // } } void llama_model::load_vocab(llama_model_loader & ml) { @@ -3457,6 +3455,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) { clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + for (int i = 0; i < n_vlayer; ++i) { + auto & layer = clip.layers[i]; + + layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}); + layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_vembd}); + layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}); + layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_vembd}); + layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}); + layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_vembd}); + + layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}); + layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_vff}); + layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}); + layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_vembd}); + + layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd}); + layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_vembd}); + layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), 
{n_vembd}); + layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}); + + layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}); + layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_vembd}); + } + } break; + case VISION_ARCH_MINICPMV: + { + clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); + clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + + // TODO: load all resampler tensors + for (int i = 0; i < n_vlayer; ++i) { auto & layer = clip.layers[i]; diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index 9b78ec1a6099e..73c96031501f9 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -63,6 +63,10 @@ uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) { return clip_model.mm_2_b->ne[0]; } else if (proj_type == CLIP_PROJECTOR_TYPE_LDPV2) { return clip_model.mm_model_peg_0_b->ne[0]; + } else if (proj_type == CLIP_PROJECTOR_TYPE_MINICPMV_2_5) { + return 4096; + } else if (proj_type == CLIP_PROJECTOR_TYPE_MINICPMV_2_6) { + return 3584; } else { GGML_ASSERT(false && "invalid proj type"); } @@ -243,6 +247,173 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, std::vector(std::round(static_cast(length) / patch_size) * patch_size), patch_size); + } + + std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.first; + int height = original_size.second; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + int best_width = ensure_divide(width, patch_size); + int best_height = ensure_divide(height, patch_size); + return std::make_pair(best_width, best_height); + } + + std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width, height; + std::tie(width, height) = original_size; + int grid_x, grid_y; + std::tie(grid_x, grid_y) = grid; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + int grid_width = refine_width / grid_x; + int grid_height = refine_height / grid_y; + + // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) + auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair + int best_grid_width, best_grid_height; + std::tie(best_grid_width, best_grid_height) = best_grid_size; + + // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) + std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) + return refine_size; + } + + std::pair uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + 
std::vector> candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.emplace_back(m, split_grids_nums / m); + } + ++m; + } + } + + std::pair best_grid{1, 1}; + float min_error = std::numeric_limits::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; + } + + std::vector> uhd_slice_image( + const clip_image_u8 & img, + const int max_slice_nums = 9, + const int scale_resolution = 448, + const int patch_size = 14) { + const std::pair original_size={img.nx,img.ny}; + const int original_width = img.nx; + const int original_height = img.ny; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + std::vector> images; + LLAMA_LOG_DEBUG("%s: multiple %d\n", __func__, multiple); + images.push_back(std::vector()); + + if (multiple <= 1) { + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); + clip_image_u8 source_image; + bicubic_resize(img, source_image, best_size.first, best_size.second); + // source_image = image.resize(best_size, Image.Resampling.BICUBIC) + images[images.size()-1].push_back(source_image); + } + else if (multiple > 1) { + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); + clip_image_u8 source_image; + bicubic_resize(img, source_image, best_size.first, best_size.second); + // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + LLAMA_LOG_DEBUG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img.nx, img.ny, best_size.first, best_size.second); + images[images.size()-1].push_back(source_image); + + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + LLAMA_LOG_DEBUG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img.nx, img.ny, best_grid.first, best_grid.second); + + auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); + clip_image_u8 refine_image; + bicubic_resize(img, refine_image, refine_size.first, refine_size.second); + + LLAMA_LOG_DEBUG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image.nx, refine_image.ny, refine_size.first, refine_size.second); + + // split_to_patches + int width = refine_image.nx; + int height = refine_image.ny; + int grid_x = int(width / best_grid.first); + int grid_y = int(height / best_grid.second); + for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ + images.push_back(std::vector()); + for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ + clip_image_u8 patch; + patch.nx = grid_x; + patch.ny = grid_y; + patch.buf.resize(3 * patch.nx * patch.ny); + for (int y = patches_i; y < patches_i + grid_y; ++y) { + for (int x = patches_j; x < patches_j + grid_x; ++x) { + const int i = 3 * (y * refine_image.nx + x); + const int j = 3 * ((y-patches_i) * patch.nx + (x-patches_j)); + patch.buf[j] = refine_image.buf[i]; + patch.buf[j+1] = refine_image.buf[i+1]; + patch.buf[j+2] = refine_image.buf[i+2]; + } + } + images[images.size()-1].push_back(patch); + } + } + } + return images; + 
} +}; + +static llama_vision_patches clip_image_preprocess_minicpmv(const clip_context & ctx, const clip_image_u8 & img) { + auto & params = ctx.model->hparams; + GGML_ASSERT(params.arch == VISION_ARCH_MINICPMV); + + static const int max_slice_nums = 9; + minicpmv_preprocessor preprocessor; + std::vector> imgs = preprocessor.uhd_slice_image(img, max_slice_nums); + + llama_vision_patches output_patches; + output_patches.n_px = clip_n_patches_x(ctx); + output_patches.n_py = clip_n_patches_y(ctx); + output_patches.px = params.patch_size; + output_patches.py = params.patch_size; + + for (size_t i = 0; i < imgs.size(); ++i) { + for (size_t j = 0; j < imgs[i].size(); ++j) { + std::vector res; + normalize_image_u8_to_f32(imgs[i][j], res, params.image_mean, params.image_std); + output_patches.buf.push_back(res); + } + } +} + // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found static llama_vision_patches clip_image_preprocess(const clip_context & ctx, const clip_image_u8 & img) { @@ -724,8 +895,10 @@ struct llama_vision_patches * llama_vision_patches_init( struct llama_context * ctx, llama_vision_bitmap * bmp) { clip_context & vctx = ctx->vctx; - llama_vision_patches p = clip_image_preprocess(vctx, *bmp); - return new llama_vision_patches(p); + if (vctx.model->hparams.arch == VISION_ARCH_MINICPMV) { + return new llama_vision_patches(clip_image_preprocess_minicpmv(vctx, *bmp)); + } + return new llama_vision_patches(clip_image_preprocess(vctx, *bmp)); } void llama_vision_patches_free(llama_vision_patches * p) { diff --git a/src/llama-vision.h b/src/llama-vision.h index 5401cb51a5160..19377abef722e 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -11,6 +11,8 @@ enum clip_projector_type { CLIP_PROJECTOR_TYPE_UNKNOWN, CLIP_PROJECTOR_TYPE_MLP, CLIP_PROJECTOR_TYPE_LDPV2, + CLIP_PROJECTOR_TYPE_MINICPMV_2_5, + CLIP_PROJECTOR_TYPE_MINICPMV_2_6, }; enum mm_patch_merge { @@ -36,7 +38,7 @@ struct clip_hparams { float eps; clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_UNKNOWN; - mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_FLAT; + mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_UNKNOWN; std::array image_mean; std::array image_std; @@ -107,6 +109,26 @@ struct clip_vision_model { struct ggml_tensor * mm_model_peg_0_w = nullptr; struct ggml_tensor * mm_model_peg_0_b = nullptr; + // MINICPMV projection + struct ggml_tensor * mm_model_pos_embed_k; + struct ggml_tensor * mm_model_query; + struct ggml_tensor * mm_model_proj; + struct ggml_tensor * mm_model_kv_proj; + struct ggml_tensor * mm_model_attn_q_w; + struct ggml_tensor * mm_model_attn_q_b; + struct ggml_tensor * mm_model_attn_k_w; + struct ggml_tensor * mm_model_attn_k_b; + struct ggml_tensor * mm_model_attn_v_w; + struct ggml_tensor * mm_model_attn_v_b; + struct ggml_tensor * mm_model_attn_o_w; + struct ggml_tensor * mm_model_attn_o_b; + struct ggml_tensor * mm_model_ln_q_w; + struct ggml_tensor * mm_model_ln_q_b; + struct ggml_tensor * mm_model_ln_kv_w; + struct ggml_tensor * mm_model_ln_kv_b; + struct ggml_tensor * mm_model_ln_post_w; + struct ggml_tensor * mm_model_ln_post_b; + struct ggml_tensor * image_newline = nullptr; }; @@ -135,6 +157,18 @@ struct llama_vision_patches { std::vector> buf; // preprocessed image data }; +inline vision_arch vision_arch_from_string(const std::string & name) { + if (name == "llava") { + return 
VISION_ARCH_LLAVA; + } else if (name == "mobilevlm") { + return VISION_ARCH_MOBILEVLM; + } else if (name == "minicpmv") { + return VISION_ARCH_MINICPMV; + } + + return VISION_ARCH_UNKNOWN; +} + inline mm_patch_merge mm_patch_merge_from_name(std::string & name) { if (name == "flat") { return MM_PATCH_MERGE_FLAT; @@ -149,6 +183,10 @@ inline clip_projector_type clip_projector_type_from_name(std::string & name) { return CLIP_PROJECTOR_TYPE_MLP; } else if (name == "ldpv2") { return CLIP_PROJECTOR_TYPE_LDPV2; + } else if (name == "minicpmv-2.5") { + return CLIP_PROJECTOR_TYPE_MINICPMV_2_5; + } else if (name == "minicpmv-2.6") { + return CLIP_PROJECTOR_TYPE_MINICPMV_2_6; } return CLIP_PROJECTOR_TYPE_UNKNOWN; } From 431bb0805919ef74cbae8ca918301c468c642380 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 21 Jan 2025 10:51:26 +0100 Subject: [PATCH 06/25] change gguf KV from clip to vit --- convert_hf_to_gguf.py | 36 ++++++++++++------------- gguf-py/gguf/constants.py | 34 ++++++++++++------------ gguf-py/gguf/gguf_writer.py | 52 ++++++++++++++++++------------------- src/llama-arch.cpp | 30 ++++++++++----------- src/llama-arch.h | 30 ++++++++++----------- src/llama-model.cpp | 24 ++++++++--------- 6 files changed, 103 insertions(+), 103 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bf6ffb49c66e1..d32272ac2da19 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -281,17 +281,17 @@ def set_gguf_parameters(self): # Vision model parameters if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None: - self.gguf_writer.add_vision_type("clip-vit") + self.gguf_writer.add_vision_type("vit") self.gguf_writer.add_vision_image_size(self.vparams["image_size"]) self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"]) - self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch]) - self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"]) - self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"]) - self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"]) - self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"]) - self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"]) - self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"]) - self.gguf_writer.add_vision_clip_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"])) + self.gguf_writer.add_vision_vit_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch]) + self.gguf_writer.add_vision_vit_block_count(self.vparams["num_hidden_layers"]) + self.gguf_writer.add_vision_vit_embedding_length(self.vparams["hidden_size"]) + self.gguf_writer.add_vision_vit_feed_forward_length(self.vparams["intermediate_size"]) + self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"]) + self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"]) + self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"])) self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") @@ -1690,15 +1690,15 @@ def set_gguf_parameters(self): # For vision model if self.vparams is not None: - self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) + 
self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) # TODO: should not hardcode these, but they are currently missing from config.json if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA: - self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) + self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP) if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM: - self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2) - self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05) + self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2) + self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05) max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 - self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) + self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): @@ -2193,11 +2193,11 @@ def set_gguf_parameters(self): # For vision model if self.vparams is not None and self.proj_type is not None: - self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) - self.gguf_writer.add_vision_clip_projector_type(self.proj_type) - self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-06) + self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) + self.gguf_writer.add_vision_vit_projector_type(self.proj_type) + self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06) max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 - self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) + self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bd7befed207ed..601016eda7449 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -215,29 +215,29 @@ class Adapter: LORA_ALPHA = "adapter.lora.alpha" class Vision: - # only support vision.type = "clip-vit" for now + # only support vision.type = "vit" for now TYPE = "vision.type" IMAGE_SIZE = "vision.image_size" PATCH_SIZE = "vision.patch_size" IMAGE_MEAN = "vision.image_mean" IMAGE_STD = "vision.image_std" - class Clip: - ARCHITECTURE = "vision.clip.architecture" - CONTEXT_LENGTH = "vision.clip.context_length" - EMBEDDING_LENGTH = "vision.clip.embedding_length" - BLOCK_COUNT = "vision.clip.block_count" - FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length" - PROJECTION_TYPE = "vision.clip.projection_type" - PROJECTION_DIM = "vision.clip.projection_dim" - USE_GELU = "vision.clip.use_gelu" - MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings" - MAX_SLICES = "vision.clip.max_slices" - PROJECTOR_TYPE = "vision.clip.projector_type" - SELECT_LAYER = "vision.clip.select_layer" - PATCH_MERGE_TYPE = "vision.clip.patch_merge_type" - HEAD_COUNT = "vision.clip.attention.head_count" - LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon" + class Vit: + ARCHITECTURE = "vision.vit.architecture" + CONTEXT_LENGTH = "vision.vit.context_length" + EMBEDDING_LENGTH = "vision.vit.embedding_length" + BLOCK_COUNT = "vision.vit.block_count" + FEED_FORWARD_LENGTH = "vision.vit.feed_forward_length" + PROJECTION_TYPE = "vision.vit.projection_type" + PROJECTION_DIM = "vision.vit.projection_dim" + USE_GELU = "vision.vit.use_gelu" + MAX_POS_EMBEDDING = 
"vision.vit.max_position_embeddings" + MAX_SLICES = "vision.vit.max_slices" + PROJECTOR_TYPE = "vision.vit.projector_type" + SELECT_LAYER = "vision.vit.select_layer" + PATCH_MERGE_TYPE = "vision.vit.patch_merge_type" + HEAD_COUNT = "vision.vit.attention.head_count" + LAYERNORM_EPS = "vision.vit.attention.layer_norm_epsilon" # # recommended mapping of model tensor names for storage in gguf diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4b9a0c9662dfe..65d0e8f3004ab 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -886,46 +886,46 @@ def add_vision_image_size(self, value: int) -> None: def add_vision_patch_size(self, value: int) -> None: self.add_uint32(Keys.Vision.PATCH_SIZE, value) - def add_vision_clip_architecture(self, value: str) -> None: - self.add_string(Keys.Vision.Clip.ARCHITECTURE, value) + def add_vision_vit_architecture(self, value: str) -> None: + self.add_string(Keys.Vision.Vit.ARCHITECTURE, value) - def add_vision_clip_context_length(self, value: int) -> None: - self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value) + def add_vision_vit_context_length(self, value: int) -> None: + self.add_uint32(Keys.Vision.Vit.CONTEXT_LENGTH, value) - def add_vision_clip_embedding_length(self, value: int) -> None: - self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value) + def add_vision_vit_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.Vision.Vit.EMBEDDING_LENGTH, value) - def add_vision_clip_block_count(self, value: int) -> None: - self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value) + def add_vision_vit_block_count(self, value: int) -> None: + self.add_uint32(Keys.Vision.Vit.BLOCK_COUNT, value) - def add_vision_clip_feed_forward_length(self, value: int) -> None: - self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value) + def add_vision_vit_feed_forward_length(self, value: int) -> None: + self.add_uint32(Keys.Vision.Vit.FEED_FORWARD_LENGTH, value) - def add_vision_clip_head_count(self, value: int) -> None: - self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value) + def add_vision_vit_head_count(self, value: int) -> None: + self.add_uint32(Keys.Vision.Vit.HEAD_COUNT, value) - def add_vision_clip_max_position_embeddings(self, value: int) -> None: - self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value) + def add_vision_vit_max_position_embeddings(self, value: int) -> None: + self.add_uint32(Keys.Vision.Vit.MAX_POS_EMBEDDING, value) - def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None: - self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value) + def add_vision_vit_projector_type(self, value: CLIPProjectorType) -> None: + self.add_string(Keys.Vision.Vit.PROJECTOR_TYPE, value.value) - def add_vision_clip_max_slices(self, value: int) -> None: - self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value) + def add_vision_vit_max_slices(self, value: int) -> None: + self.add_uint32(Keys.Vision.Vit.MAX_SLICES, value) - def add_vision_clip_select_layer(self, value: int) -> None: - self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value) + def add_vision_vit_select_layer(self, value: int) -> None: + self.add_int32(Keys.Vision.Vit.SELECT_LAYER, value) - def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None: - self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value) + def add_vision_vit_patch_merge_type(self, value: CLIPPatchMergeType) -> None: + self.add_string(Keys.Vision.Vit.PATCH_MERGE_TYPE, value.value) - def add_vision_clip_layer_norm_epsilon(self, value: float) -> 
None: - self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value) + def add_vision_vit_layer_norm_epsilon(self, value: float) -> None: + self.add_float32(Keys.Vision.Vit.LAYERNORM_EPS, value) - def add_vision_clip_image_mean(self, value: Sequence[float]) -> None: + def add_vision_vit_image_mean(self, value: Sequence[float]) -> None: self.add_array(Keys.Vision.IMAGE_MEAN, value) - def add_vision_clip_image_std(self, value: Sequence[float]) -> None: + def add_vision_vit_image_std(self, value: Sequence[float]) -> None: self.add_array(Keys.Vision.IMAGE_STD, value) def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index e2908c0ae0956..48336943c2157 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -195,21 +195,21 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_VISION_PATCH_SIZE, "vision.patch_size" }, { LLM_KV_VISION_IMAGE_MEAN, "vision.image_mean" }, { LLM_KV_VISION_IMAGE_STD, "vision.image_std" }, - { LLM_KV_VISION_CLIP_ARCHITECTURE, "vision.clip.architecture" }, - { LLM_KV_VISION_CLIP_CONTEXT_LENGTH, "vision.clip.context_length" }, - { LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, "vision.clip.embedding_length" }, - { LLM_KV_VISION_CLIP_BLOCK_COUNT, "vision.clip.block_count" }, - { LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, "vision.clip.feed_forward_length" }, - { LLM_KV_VISION_CLIP_PROJECTION_TYPE, "vision.clip.projection_type" }, - { LLM_KV_VISION_CLIP_PROJECTION_DIM, "vision.clip.projection_dim" }, - { LLM_KV_VISION_CLIP_USE_GELU, "vision.clip.use_gelu" }, - { LLM_KV_VISION_CLIP_MAX_POS_EMBD, "vision.clip.max_position_embeddings" }, - { LLM_KV_VISION_CLIP_MAX_SLICES, "vision.clip.max_slices" }, - { LLM_KV_VISION_CLIP_PROJECTOR_TYPE, "vision.clip.projector_type" }, - { LLM_KV_VISION_CLIP_SELECT_LAYER, "vision.clip.select_layer" }, - { LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" }, - { LLM_KV_VISION_CLIP_HEAD_COUNT, "vision.clip.attention.head_count" }, - { LLM_KV_VISION_CLIP_LAYERNORM_EPS, "vision.clip.attention.layer_norm_epsilon" }, + { LLM_KV_VISION_VIT_ARCHITECTURE, "vision.vit.architecture" }, + { LLM_KV_VISION_VIT_CONTEXT_LENGTH, "vision.vit.context_length" }, + { LLM_KV_VISION_VIT_EMBEDDING_LENGTH, "vision.vit.embedding_length" }, + { LLM_KV_VISION_VIT_BLOCK_COUNT, "vision.vit.block_count" }, + { LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, "vision.vit.feed_forward_length" }, + { LLM_KV_VISION_VIT_PROJECTION_TYPE, "vision.vit.projection_type" }, + { LLM_KV_VISION_VIT_PROJECTION_DIM, "vision.vit.projection_dim" }, + { LLM_KV_VISION_VIT_USE_GELU, "vision.vit.use_gelu" }, + { LLM_KV_VISION_VIT_MAX_POS_EMBD, "vision.vit.max_position_embeddings" }, + { LLM_KV_VISION_VIT_MAX_SLICES, "vision.vit.max_slices" }, + { LLM_KV_VISION_VIT_PROJECTOR_TYPE, "vision.vit.projector_type" }, + { LLM_KV_VISION_VIT_SELECT_LAYER, "vision.vit.select_layer" }, + { LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" }, + { LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" }, + { LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" }, // deprecated { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 7d4a1cd8c9567..5629dc46d084d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -205,21 +205,21 @@ enum llm_kv { LLM_KV_VISION_PATCH_SIZE, LLM_KV_VISION_IMAGE_MEAN, LLM_KV_VISION_IMAGE_STD, - LLM_KV_VISION_CLIP_ARCHITECTURE, - LLM_KV_VISION_CLIP_CONTEXT_LENGTH, - 
LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, - LLM_KV_VISION_CLIP_BLOCK_COUNT, - LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, - LLM_KV_VISION_CLIP_PROJECTION_TYPE, - LLM_KV_VISION_CLIP_PROJECTION_DIM, - LLM_KV_VISION_CLIP_USE_GELU, - LLM_KV_VISION_CLIP_MAX_POS_EMBD, - LLM_KV_VISION_CLIP_MAX_SLICES, - LLM_KV_VISION_CLIP_PROJECTOR_TYPE, - LLM_KV_VISION_CLIP_SELECT_LAYER, - LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, - LLM_KV_VISION_CLIP_HEAD_COUNT, - LLM_KV_VISION_CLIP_LAYERNORM_EPS, + LLM_KV_VISION_VIT_ARCHITECTURE, + LLM_KV_VISION_VIT_CONTEXT_LENGTH, + LLM_KV_VISION_VIT_EMBEDDING_LENGTH, + LLM_KV_VISION_VIT_BLOCK_COUNT, + LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, + LLM_KV_VISION_VIT_PROJECTION_TYPE, + LLM_KV_VISION_VIT_PROJECTION_DIM, + LLM_KV_VISION_VIT_USE_GELU, + LLM_KV_VISION_VIT_MAX_POS_EMBD, + LLM_KV_VISION_VIT_MAX_SLICES, + LLM_KV_VISION_VIT_PROJECTOR_TYPE, + LLM_KV_VISION_VIT_SELECT_LAYER, + LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, + LLM_KV_VISION_VIT_HEAD_COUNT, + LLM_KV_VISION_VIT_LAYERNORM_EPS, // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d4d53aba69047..a305fa46310b0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1251,23 +1251,23 @@ void llama_model::load_hparams(llama_model_loader & ml) { auto & vparams = clip.hparams; std::string vision_type; ml.get_key(LLM_KV_VISION_TYPE, vision_type, false); - if (vision_type == "clip-vit") { - LLAMA_LOG_INFO("%s: loading clip-vit vision model\n", __func__); + if (vision_type == "vit") { + LLAMA_LOG_INFO("%s: loading ViT vision model\n", __func__); has_vision = true; ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true); ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true); ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true); ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, vparams.image_std, 3, true); - ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, vparams.hidden_size, true); - ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, vparams.n_layer, true); - ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true); - ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true); - ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true); - ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true); - ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true); + ml.get_key(LLM_KV_VISION_VIT_EMBEDDING_LENGTH, vparams.hidden_size, true); + ml.get_key(LLM_KV_VISION_VIT_BLOCK_COUNT, vparams.n_layer, true); + ml.get_key(LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, vparams.n_intermediate, true); + ml.get_key(LLM_KV_VISION_VIT_HEAD_COUNT, vparams.n_head, true); + ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true); + ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true); + ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true); { std::string name; - ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true); + ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true); vparams.proj_type = clip_projector_type_from_name(name); if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) { throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str())); @@ -1275,12 +1275,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { } { std::string name; - ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false); + ml.get_key(LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, name, false); vparams.mm_patch_merge_type = mm_patch_merge_from_name(name); } { std::string arch; - 
ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true); + ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true); vparams.arch = vision_arch_from_string(arch); if (vparams.arch == VISION_ARCH_UNKNOWN) { throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str())); From bd0714b977ae7cb59569c28ced0a9a1186ba68d9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 21 Jan 2025 14:27:16 +0100 Subject: [PATCH 07/25] reuse LLM_ARCH and LLM_TENSOR --- src/llama-arch.cpp | 144 ++++++++++++++++++------------------------- src/llama-arch.h | 95 +++++++++++----------------- src/llama-model.cpp | 130 +++++++++++++++++++------------------- src/llama-vision.cpp | 6 +- src/llama-vision.h | 14 +---- 5 files changed, 165 insertions(+), 224 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 48336943c2157..a2e848c11d76e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -63,6 +63,9 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, + { LLM_ARCH_VISION_LLAVA, "llava" }, + { LLM_ARCH_VISION_MOBILEVLM, "mobilevlm" }, + { LLM_ARCH_VISION_MINICPMV, "minicpmv" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1314,77 +1317,75 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, }, }, + // vision { - LLM_ARCH_UNKNOWN, + LLM_ARCH_VISION_LLAVA, { - { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - }, + { LLM_TENSOR_V_MMPROJ, "v.mmproj_%d" }, + { LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" }, + { LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" }, + { LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" }, + { LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, + { LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, + { LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, + { LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, + { LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" }, + { LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, + { LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, + { LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, + { LLM_TENSOR_V_PRE_NORM, "v.pre_norm" }, + { LLM_TENSOR_V_POST_NORM, "v.post_norm" }, + } }, -}; - -static const std::map> VISION_TENSOR_NAMES = { { - VISION_ARCH_LLAVA, + LLM_ARCH_VISION_MOBILEVLM, { - { VISION_TENSOR_MMPROJ, "v.mmproj_%d" }, - { VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" }, - { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" }, - { VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" }, - { VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, - { VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, - { VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, - { VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, - { VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" }, - { VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, - { VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, - { VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, - { VISION_TENSOR_PRE_NORM, "v.pre_norm" }, - { VISION_TENSOR_POST_NORM, "v.post_norm" }, + { LLM_TENSOR_V_MMPROJ_MLP, "v.mmproj.mlp.%d" }, + { LLM_TENSOR_V_MMPROJ_PEG, "v.mmproj.peg.%d" }, + { LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" }, + { LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" }, + { LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" }, + { LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, + { LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, + { LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, + { 
LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, + { LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" }, + { LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, + { LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, + { LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, + { LLM_TENSOR_V_PRE_NORM, "v.pre_norm" }, + { LLM_TENSOR_V_POST_NORM, "v.post_norm" }, } }, { - VISION_ARCH_MOBILEVLM, + LLM_ARCH_VISION_MINICPMV, { - { VISION_TENSOR_MMPROJ_MLP, "v.mmproj.mlp.%d" }, - { VISION_TENSOR_MMPROJ_PEG, "v.mmproj.peg.%d" }, - { VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" }, - { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" }, - { VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" }, - { VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, - { VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, - { VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, - { VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, - { VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" }, - { VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, - { VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, - { VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, - { VISION_TENSOR_PRE_NORM, "v.pre_norm" }, - { VISION_TENSOR_POST_NORM, "v.post_norm" }, + { LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" }, + { LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" }, + { LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, + { LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, + { LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, + { LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, + { LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" }, + { LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, + { LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, + { LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, + { LLM_TENSOR_V_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" }, + { LLM_TENSOR_V_RESMPL_ATTN_IN, "v.resmpl.attn_in" }, + { LLM_TENSOR_V_RESMPL_ATTN_OUT, "v.resmpl.attn_out" }, + { LLM_TENSOR_V_RESMPL_KV_PROJ, "v.resmpl.kv_proj" }, + { LLM_TENSOR_V_RESMPL_NORM_POST, "v.resmpl.norm_post" }, + { LLM_TENSOR_V_RESMPL_NORM_KV, "v.resmpl.norm_kv" }, + { LLM_TENSOR_V_RESMPL_NORM_Q, "v.resmpl.norm_q" }, + { LLM_TENSOR_V_RESMPL_PROJ, "v.resmpl.proj" }, + { LLM_TENSOR_V_RESMPL_QUERY, "v.resmpl.query" }, } }, { - VISION_ARCH_MINICPMV, + LLM_ARCH_UNKNOWN, { - { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" }, - { VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" }, - { VISION_TENSOR_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, - { VISION_TENSOR_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, - { VISION_TENSOR_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, - { VISION_TENSOR_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, - { VISION_TENSOR_ENC_OUTPUT, "v.enc.blk.%d.output" }, - { VISION_TENSOR_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, - { VISION_TENSOR_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, - { VISION_TENSOR_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, - { VISION_TENSOR_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" }, - { VISION_TENSOR_RESMPL_ATTN_IN, "v.resmpl.attn_in" }, - { VISION_TENSOR_RESMPL_ATTN_OUT, "v.resmpl.attn_out" }, - { VISION_TENSOR_RESMPL_KV_PROJ, "v.resmpl.kv_proj" }, - { VISION_TENSOR_RESMPL_NORM_POST, "v.resmpl.norm_post" }, - { VISION_TENSOR_RESMPL_NORM_KV, "v.resmpl.norm_kv" }, - { VISION_TENSOR_RESMPL_NORM_Q, "v.resmpl.norm_q" }, - { VISION_TENSOR_RESMPL_PROJ, "v.resmpl.proj" }, - { VISION_TENSOR_RESMPL_QUERY, "v.resmpl.query" }, - } + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, }, }; @@ -1537,12 +1538,7 @@ std::string LLM_KV::operator()(llm_kv 
kv) const { return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch)); } -template<> -std::string BASE_TN_IMPL::str() const { - if (LLM_TENSOR_NAMES.find(arch) == LLM_TENSOR_NAMES.end()) { - throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch)); - } - +std::string LLM_TN_IMPL::str() const { if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } @@ -1557,26 +1553,6 @@ std::string BASE_TN_IMPL::str() const { return name; } -template<> -std::string BASE_TN_IMPL::str() const { - if (VISION_TENSOR_NAMES.find(arch) == VISION_TENSOR_NAMES.end()) { - throw std::runtime_error(format("Cannot find tensor name mapping for arch %d", arch)); - } - - if (VISION_TENSOR_NAMES.at(arch).find(tensor) == VISION_TENSOR_NAMES.at(arch).end()) { - return "__missing__"; - } - - std::string name = ::format(VISION_TENSOR_NAMES.at(arch).at(tensor), bid, xid); - - if (suffix != nullptr) { - name += "."; - name += suffix; - } - - return name; -} - const char * llm_arch_name(llm_arch arch) { auto it = LLM_ARCH_NAMES.find(arch); if (it == LLM_ARCH_NAMES.end()) { diff --git a/src/llama-arch.h b/src/llama-arch.h index 5629dc46d084d..da118e1e109d8 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -66,16 +66,13 @@ enum llm_arch { LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, LLM_ARCH_WAVTOKENIZER_DEC, + // vision + LLM_ARCH_VISION_LLAVA, + LLM_ARCH_VISION_MOBILEVLM, + LLM_ARCH_VISION_MINICPMV, LLM_ARCH_UNKNOWN, }; -enum vision_arch { - VISION_ARCH_UNKNOWN, - VISION_ARCH_LLAVA, - VISION_ARCH_MOBILEVLM, - VISION_ARCH_MINICPMV, -}; - enum llm_kv { LLM_KV_GENERAL_TYPE, LLM_KV_GENERAL_ARCHITECTURE, @@ -354,35 +351,33 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, -}; - -enum vision_tensor { - VISION_TENSOR_MMPROJ, - VISION_TENSOR_MMPROJ_MLP, - VISION_TENSOR_MMPROJ_PEG, - VISION_TENSOR_ENC_EMBD_CLS, - VISION_TENSOR_ENC_EMBD_PATCH, - VISION_TENSOR_ENC_EMBD_POS, - VISION_TENSOR_ENC_ATTN_Q, - VISION_TENSOR_ENC_ATTN_K, - VISION_TENSOR_ENC_ATTN_V, - VISION_TENSOR_ENC_INPUT_NORM, - VISION_TENSOR_ENC_OUTPUT, - VISION_TENSOR_ENC_OUTPUT_NORM, - VISION_TENSOR_ENC_FFN_UP, - VISION_TENSOR_ENC_FFN_DOWN, - VISION_TENSOR_PRE_NORM, - VISION_TENSOR_POST_NORM, - // minicpmv - VISION_TENSOR_RESMPL_POS_EMBD_K, - VISION_TENSOR_RESMPL_ATTN_IN, - VISION_TENSOR_RESMPL_ATTN_OUT, - VISION_TENSOR_RESMPL_KV_PROJ, - VISION_TENSOR_RESMPL_NORM_POST, - VISION_TENSOR_RESMPL_NORM_KV, - VISION_TENSOR_RESMPL_NORM_Q, - VISION_TENSOR_RESMPL_PROJ, - VISION_TENSOR_RESMPL_QUERY, + // vision + LLM_TENSOR_V_MMPROJ, + LLM_TENSOR_V_MMPROJ_MLP, + LLM_TENSOR_V_MMPROJ_PEG, + LLM_TENSOR_V_ENC_EMBD_CLS, + LLM_TENSOR_V_ENC_EMBD_PATCH, + LLM_TENSOR_V_ENC_EMBD_POS, + LLM_TENSOR_V_ENC_ATTN_Q, + LLM_TENSOR_V_ENC_ATTN_K, + LLM_TENSOR_V_ENC_ATTN_V, + LLM_TENSOR_V_ENC_INPUT_NORM, + LLM_TENSOR_V_ENC_OUTPUT, + LLM_TENSOR_V_ENC_OUTPUT_NORM, + LLM_TENSOR_V_ENC_FFN_UP, + LLM_TENSOR_V_ENC_FFN_DOWN, + LLM_TENSOR_V_PRE_NORM, + LLM_TENSOR_V_POST_NORM, + // vision - minicpmv + LLM_TENSOR_V_RESMPL_POS_EMBD_K, + LLM_TENSOR_V_RESMPL_ATTN_IN, + LLM_TENSOR_V_RESMPL_ATTN_OUT, + LLM_TENSOR_V_RESMPL_KV_PROJ, + LLM_TENSOR_V_RESMPL_NORM_POST, + LLM_TENSOR_V_RESMPL_NORM_KV, + LLM_TENSOR_V_RESMPL_NORM_Q, + LLM_TENSOR_V_RESMPL_PROJ, + LLM_TENSOR_V_RESMPL_QUERY, }; enum llm_tensor_layer { @@ -408,10 +403,9 @@ struct LLM_KV { // std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias" // std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> 
"blk.3.attn_norm.weight" // -template -struct BASE_TN_IMPL { - const Tname arch; - const Ttensor tensor; +struct LLM_TN_IMPL { + const llm_arch arch; + const llm_tensor tensor; const char * const suffix; const int bid; const int xid; @@ -422,16 +416,15 @@ struct BASE_TN_IMPL { return str(); } - friend bool operator==(const std::string & str, const BASE_TN_IMPL & tn) { + friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) { return str == tn.str(); } - friend bool operator!=(const std::string & str, const BASE_TN_IMPL & tn) { + friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) { return str != tn.str(); } }; -using LLM_TN_IMPL = BASE_TN_IMPL; struct LLM_TN { LLM_TN(llm_arch arch) : arch(arch) {} @@ -446,20 +439,6 @@ struct LLM_TN { } }; -struct VISION_TN { - VISION_TN(vision_arch arch) : arch(arch) {} - - vision_arch arch; - - BASE_TN_IMPL operator()(vision_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const { - return { arch, tensor, suffix, bid, xid }; - } - - BASE_TN_IMPL operator()(vision_tensor tensor, int bid = -1, int xid = -1) const { - return { arch, tensor, nullptr, bid, xid }; - } -}; - struct llm_tensor_info { llm_tensor_layer layer; @@ -470,6 +449,4 @@ const char * llm_arch_name(llm_arch arch); llm_arch llm_arch_from_string(const std::string & name); -vision_arch vision_arch_from_string(const std::string & name); - const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a305fa46310b0..0ea66d254a73c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1281,8 +1281,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { { std::string arch; ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true); - vparams.arch = vision_arch_from_string(arch); - if (vparams.arch == VISION_ARCH_UNKNOWN) { + vparams.arch = llm_arch_from_string(arch); + if (vparams.arch == LLM_ARCH_UNKNOWN) { throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str())); } } @@ -3421,7 +3421,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t max_pos_embd = vparams.max_pos_embd; const int64_t n_channel = 3; // always RGB const int64_t patch_size = vparams.patch_size; - const auto tn = VISION_TN(vparams.arch); + const auto tn = LLM_TN(vparams.arch); // clip is CPU-only for now clip.buft = ggml_backend_cpu_buffer_type(); @@ -3429,85 +3429,85 @@ bool llama_model::load_tensors(llama_model_loader & ml) { clip.layers.resize(n_vlayer); switch (vparams.arch) { - case VISION_ARCH_LLAVA: - case VISION_ARCH_MOBILEVLM: + case LLM_ARCH_VISION_LLAVA: + case LLM_ARCH_VISION_MOBILEVLM: { - if (vparams.arch == VISION_ARCH_LLAVA) { - clip.mm_1_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_vembd, n_vff}); - clip.mm_1_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_vff}); - clip.mm_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_vff, n_vff}); - clip.mm_2_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_vff}); - } else if (vparams.arch == VISION_ARCH_MOBILEVLM) { - clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd}); - clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 0), {n_embd}); - clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "weight", 2), {n_embd, n_embd}); - clip.mm_model_mlp_2_b = 
ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_MLP, "bias", 2), {n_embd}); - clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd}); - clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_PEG, "bias", 0), {n_embd}); + if (vparams.arch == LLM_ARCH_VISION_LLAVA) { + clip.mm_1_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff}); + clip.mm_1_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff}); + clip.mm_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff}); + clip.mm_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff}); + } else if (vparams.arch == LLM_ARCH_VISION_MOBILEVLM) { + clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd}); + clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd}); + clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd}); + clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd}); + clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd}); + clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd}); } - clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_vembd}); - clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); - clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + clip.class_embedding = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd}); + clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); + clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); - clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "weight"), {n_vembd}); - clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_vembd}); - clip.post_norm_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); - clip.post_norm_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd}); + clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd}); + clip.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + clip.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); for (int i = 0; i < n_vlayer; ++i) { auto & layer = clip.layers[i]; - layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}); - layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_vembd}); - layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), 
{n_vembd, n_vembd}); - layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_vembd}); - layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}); - layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_vembd}); - - layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}); - layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_vff}); - layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}); - layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_vembd}); - - layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd}); - layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_vembd}); - layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd}); - layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}); - - layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}); - layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_vembd}); + layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}); + layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}); + layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}); + layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}); + layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}); + layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}); + + layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}); + layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}); + layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}); + layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}); + + layer.norm_in_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}); + layer.norm_in_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}); + layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}); + layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}); + + layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}); + layer.output_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}); } } break; - case VISION_ARCH_MINICPMV: + case LLM_ARCH_VISION_MINICPMV: { - clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); - clip.position_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, 
patch_size, n_channel, n_vembd}); + clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); // TODO: load all resampler tensors for (int i = 0; i < n_vlayer; ++i) { auto & layer = clip.layers[i]; - layer.k_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}); - layer.k_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_K, "bias" , i), {n_vembd}); - layer.v_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}); - layer.v_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_V, "bias" , i), {n_vembd}); - layer.q_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}); - layer.q_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_ATTN_Q, "bias" , i), {n_vembd}); - - layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}); - layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_UP, "bias" , i), {n_vff}); - layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}); - layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_FFN_DOWN, "bias" , i), {n_vembd}); - - layer.norm_in_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "weight", i), {n_vembd}); - layer.norm_in_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_INPUT_NORM, "bias" , i), {n_vembd}); - layer.norm_out_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "weight", i), {n_vembd}); - layer.norm_out_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}); - - layer.output_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}); - layer.output_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_OUTPUT, "bias" , i), {n_vembd}); + layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}); + layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}); + layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}); + layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}); + layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}); + layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}); + + layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}); + layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}); + layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}); + layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}); + + layer.norm_in_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}); + layer.norm_in_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}); + layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}); + layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}); + + layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}); + layer.output_b 
= ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}); } } break; default: diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index 73c96031501f9..e348d31da137a 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -393,7 +393,7 @@ struct minicpmv_preprocessor { static llama_vision_patches clip_image_preprocess_minicpmv(const clip_context & ctx, const clip_image_u8 & img) { auto & params = ctx.model->hparams; - GGML_ASSERT(params.arch == VISION_ARCH_MINICPMV); + GGML_ASSERT(params.arch == LLM_ARCH_VISION_MINICPMV); static const int max_slice_nums = 9; minicpmv_preprocessor preprocessor; @@ -775,7 +775,7 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches auto & model = *ctx.model; auto & hparams = ctx.model->hparams; - if (hparams.arch == VISION_ARCH_LLAVA) { + if (hparams.arch == LLM_ARCH_VISION_LLAVA) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } @@ -895,7 +895,7 @@ struct llama_vision_patches * llama_vision_patches_init( struct llama_context * ctx, llama_vision_bitmap * bmp) { clip_context & vctx = ctx->vctx; - if (vctx.model->hparams.arch == VISION_ARCH_MINICPMV) { + if (vctx.model->hparams.arch == LLM_ARCH_VISION_MINICPMV) { return new llama_vision_patches(clip_image_preprocess_minicpmv(vctx, *bmp)); } return new llama_vision_patches(clip_image_preprocess(vctx, *bmp)); diff --git a/src/llama-vision.h b/src/llama-vision.h index 19377abef722e..a9304867fd4d9 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -22,7 +22,7 @@ enum mm_patch_merge { }; struct clip_hparams { - vision_arch arch = VISION_ARCH_UNKNOWN; + llm_arch arch = LLM_ARCH_UNKNOWN; uint32_t image_size; uint32_t patch_size; @@ -157,18 +157,6 @@ struct llama_vision_patches { std::vector> buf; // preprocessed image data }; -inline vision_arch vision_arch_from_string(const std::string & name) { - if (name == "llava") { - return VISION_ARCH_LLAVA; - } else if (name == "mobilevlm") { - return VISION_ARCH_MOBILEVLM; - } else if (name == "minicpmv") { - return VISION_ARCH_MINICPMV; - } - - return VISION_ARCH_UNKNOWN; -} - inline mm_patch_merge mm_patch_merge_from_name(std::string & name) { if (name == "flat") { return MM_PATCH_MERGE_FLAT; From ad38e87329647a3d107cae43e7a0f97776ca051f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 21 Jan 2025 15:53:39 +0100 Subject: [PATCH 08/25] rename everywhere --- examples/vision/vision.cpp | 10 +- include/llama.h | 15 +- src/llama-context.h | 2 +- src/llama-model.cpp | 6 +- src/llama-model.h | 2 +- src/llama-vision.cpp | 497 +++++++++++++++++++------------------ src/llama-vision.h | 144 ++++++----- 7 files changed, 350 insertions(+), 326 deletions(-) diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp index 73f8ef1b6ac79..88b5be5bb3d29 100644 --- a/examples/vision/vision.cpp +++ b/examples/vision/vision.cpp @@ -122,7 +122,7 @@ int main(int argc, char ** argv) { int n_prompt = 0; // process image - llama_vision_patches * img_patches = nullptr; + llama_vision_tokens * img_tokens = nullptr; { const char * img_path = params.image[0].c_str(); if (params.image[0].empty()) { @@ -131,12 +131,12 @@ int main(int argc, char ** argv) { } llama_vision_bitmap * img = load_image_from_file(img_path); LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny); - img_patches = llama_vision_patches_init(ctx, img); - if (!img_patches) { - LOG_ERR("failed to create image patches\n"); + img_tokens = llama_vision_tokenize(ctx, img); + if (!img_tokens) { + LOG_ERR("failed 
to create image tokens\n"); return 1; } - if (llama_vision_encode(ctx, img_patches)) { + if (llama_vision_encode(ctx, img_tokens)) { LOG_ERR("failed to encode image\n"); return 1; } diff --git a/include/llama.h b/include/llama.h index bd8e696585693..c230e0c3d9776 100644 --- a/include/llama.h +++ b/include/llama.h @@ -229,7 +229,9 @@ extern "C" { bool sorted; } llama_token_data_array; - struct llama_vision_patches; + // Structure represents the basic input unit of vision model + // This can be a processed image or slices of images under the hood + struct llama_vision_tokens; // represent an RGB image // size of data must be equal to 3*nx*ny @@ -1286,12 +1288,15 @@ extern "C" { LLAMA_API struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny); LLAMA_API void llama_vision_bitmap_free(struct llama_vision_bitmap * bmp); - // Create patches from the RGB bitmap - LLAMA_API struct llama_vision_patches * llama_vision_patches_init(struct llama_context * ctx, llama_vision_bitmap * bmp); - LLAMA_API void llama_vision_patches_free(struct llama_vision_patches * p); + // Create image tokens from the RGB bitmap + LLAMA_API struct llama_vision_tokens * llama_vision_tokenize(struct llama_context * ctx, llama_vision_bitmap * bmp); + LLAMA_API void llama_vision_tokens_free(struct llama_vision_tokens * img_tokens); + + // User must reserve N number of tokens in tokenized text prompt for each image + // LLAMA_API int32_t llama_vision_get_n_tokens(const llama_vision_img_tokens * img_tokens); // Encode patches into embeddings - LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_patches * p); + LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_tokens * img_tokens); LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_context * ctx); // diff --git a/src/llama-context.h b/src/llama-context.h index 10c839f55ebca..f2704b89e96b8 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -110,7 +110,7 @@ struct llama_context { struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] // vision - clip_context vctx; + llama_vision_context vctx; }; // TODO: make these methods of llama_context diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0ea66d254a73c..9e81dafe843a1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1268,8 +1268,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { { std::string name; ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true); - vparams.proj_type = clip_projector_type_from_name(name); - if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) { + vparams.proj_type = vision_projector_type_from_name(name); + if (vparams.proj_type == VISION_PROJECTOR_TYPE_UNKNOWN) { throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str())); } } @@ -3514,7 +3514,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { throw std::runtime_error("unknown vision architecture"); } - if (clip_n_mmproj_embd(clip) != hparams.n_embd) { + if (llama_vision_n_mmproj_embd(clip) != hparams.n_embd) { std::runtime_error("model has vision, but n_mmproj_embd != n_embd"); } } diff --git a/src/llama-model.h b/src/llama-model.h index fd3820f1e418b..d7a17d993efaf 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -365,7 +365,7 @@ struct llama_model { // vision bool has_vision = false; - clip_vision_model clip; + llama_vision_model clip; private: struct impl; diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index 
e348d31da137a..c583593b73dba 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -13,25 +13,25 @@ #include #include -// export clip_image_u8 to bmp file for debugging +// export llama_image_u8 to bmp file for debugging // https://codereview.stackexchange.com/questions/195121/writing-a-bitmap-image-from-c -struct clip_image_size; -static int bmp_export(const struct clip_image_u8 &img, const std::string &location); +struct img_size; +static int bmp_export(const struct llama_image_u8 &img, const std::string &location); #endif -struct clip_image_size { +struct img_size { int width; int height; }; // RGB uint8 image // Memory layout: RGBRGBRGB... -struct clip_image_u8 { +struct llama_image_u8 { int nx; int ny; std::vector buf; - clip_image_u8() {} - clip_image_u8(const llama_vision_bitmap & bmp) { + llama_image_u8() {} + llama_image_u8(const llama_vision_bitmap & bmp) { nx = bmp.nx; ny = bmp.ny; buf.resize(nx*ny*3); @@ -39,38 +39,42 @@ struct clip_image_u8 { } }; -struct clip_image_u8_batch { - struct clip_image_u8 * data; - size_t size; -}; +uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel) { + auto & proj_type = vmodel.hparams.proj_type; + if (proj_type == VISION_PROJECTOR_TYPE_MLP) { + return vmodel.mm_2_b->ne[0]; + } else if (proj_type == VISION_PROJECTOR_TYPE_LDPV2) { + return vmodel.mm_model_peg_0_b->ne[0]; + } else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_5) { + return 4096; + } else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_6) { + return 3584; + } else { + GGML_ASSERT(false && "invalid proj type"); + } +} + + +// +// internal utils +// -static int clip_n_patches_x(const clip_context & ctx) { +static int get_n_patches_x(const llama_vision_context & ctx) { auto & hparams = ctx.model->hparams; return hparams.image_size / hparams.patch_size; } -static int clip_n_patches_y(const clip_context & ctx) { - return clip_n_patches_x(ctx); +static int get_n_patches_y(const llama_vision_context & ctx) { + return get_n_patches_x(ctx); } -static int clip_n_patches(const clip_context & ctx) { - return clip_n_patches_x(ctx) * clip_n_patches_y(ctx); +static int get_n_patches(const llama_vision_context & ctx) { + return get_n_patches_x(ctx) * get_n_patches_y(ctx); } -uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) { - auto & proj_type = clip_model.hparams.proj_type; - if (proj_type == CLIP_PROJECTOR_TYPE_MLP) { - return clip_model.mm_2_b->ne[0]; - } else if (proj_type == CLIP_PROJECTOR_TYPE_LDPV2) { - return clip_model.mm_model_peg_0_b->ne[0]; - } else if (proj_type == CLIP_PROJECTOR_TYPE_MINICPMV_2_5) { - return 4096; - } else if (proj_type == CLIP_PROJECTOR_TYPE_MINICPMV_2_6) { - return 3584; - } else { - GGML_ASSERT(false && "invalid proj type"); - } -} +// +// bitmap utils +// /** * Selects the best resolution from a list of possible resolutions based on the original size. @@ -79,11 +83,11 @@ uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model) { * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. * @return The best fit resolution in the format (width, height). 
*/ -static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector& possible_resolutions) { +static img_size select_best_resolution(const img_size & original_size, const std::vector& possible_resolutions) { int original_width = original_size.width; int original_height = original_size.height; - clip_image_size best_fit; + img_size best_fit; int max_effective_resolution = 0; int min_wasted_resolution = std::numeric_limits::max(); @@ -106,7 +110,7 @@ static clip_image_size select_best_resolution(const clip_image_size & original_s return best_fit; } -static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { +static bool bicubic_resize(const llama_image_u8 & img, llama_image_u8 & dst, int target_width, int target_height) { auto clip = [](int x, int lower, int upper) -> int { return std::max(lower, std::min(x, upper)); }; @@ -173,13 +177,13 @@ static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int t return true; } -static std::vector divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { - std::vector patches; +static std::vector divide_to_patches_u8(const llama_image_u8 & image, int patch_size) { + std::vector patches; int width = image.nx; int height = image.ny; for (int i = 0; i < height; i += patch_size) { for (int j = 0; j < width; j += patch_size) { - clip_image_u8 patch; + llama_image_u8 patch; patch.nx = std::min(patch_size, width - j); patch.ny = std::min(patch_size, height - i); patch.buf.resize(3 * patch.nx * patch.ny); @@ -197,7 +201,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & ima } // llava-1.6 type of resize_and_pad (black) -static clip_image_u8 resize_and_pad_image(const clip_image_u8 & image, const clip_image_size & target_resolution) { +static llama_image_u8 resize_and_pad_image(const llama_image_u8 & image, const img_size & target_resolution) { int target_width = target_resolution.width; int target_height = target_resolution.height; @@ -214,11 +218,11 @@ static clip_image_u8 resize_and_pad_image(const clip_image_u8 & image, const cli new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); } - clip_image_u8 resized_image; + llama_image_u8 resized_image; // bilinear_resize(image, resized_image, new_width, new_height); bicubic_resize(image, resized_image, new_width, new_height); - clip_image_u8 padded_image; + llama_image_u8 padded_image; padded_image.nx = target_width; padded_image.ny = target_height; padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black @@ -238,7 +242,7 @@ static clip_image_u8 resize_and_pad_image(const clip_image_u8 & image, const cli return padded_image; } -static void normalize_image_u8_to_f32(const clip_image_u8 & src, std::vector & dst, const std::array & mean, const std::array & std) { +static void normalize_image_u8_to_f32(const llama_image_u8 & src, std::vector & dst, const std::array & mean, const std::array & std) { dst.resize(src.buf.size()); for (size_t i = 0; i < src.buf.size(); ++i) { @@ -247,15 +251,169 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, std::vectorhparams; + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (params.mm_patch_merge_type == MM_PATCH_MERGE_SPATIAL_UNPAD) { + pad_to_square = false; + } + + llama_vision_tokens output_slices; + output_slices.n_px = get_n_patches_x(ctx); + output_slices.n_py = 
get_n_patches_y(ctx); + output_slices.px = params.patch_size; + output_slices.py = params.patch_size; + + // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + + llama_image_u8 temp; + if (pad_to_square && img.nx != img.ny) { + // if the image is not square, pad it to a square + int longer_side = std::max(img.nx, img.ny); + temp.nx = longer_side; + temp.ny = longer_side; + temp.buf.resize(3 * longer_side * longer_side); + const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) + + // fill with background color + for (size_t i = 0; i < temp.buf.size(); i++) { + temp.buf[i] = bc[i % 3]; + } + + // copy from the input image + for (int y = 0; y < img.ny; y++) { + for (int x = 0; x < img.nx; x++) { + const int i = 3 * (y * img.nx + x); + const int j = 3 * (y * temp.nx + x); + temp.buf[j] = img.buf[i]; + temp.buf[j+1] = img.buf[i+1]; + temp.buf[j+2] = img.buf[i+2]; + } + } + } else if (params.image_grid_pinpoints[0] != 0) { + // "spatial_unpad" with "anyres" processing for llava-1.6 + std::vector possible_resolutions; + for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) { + img_size s; + s.width = params.image_grid_pinpoints[i]; + s.height = params.image_grid_pinpoints[i+1]; + possible_resolutions.push_back(s); + } + img_size best_resolution = select_best_resolution({img.nx, img.ny}, possible_resolutions); + // debug_image_save_to_bmp(*img, "input.bmp"); + temp = resize_and_pad_image(img, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 + // debug_image_save_to_bmp(*temp, "resized.bmp"); + + std::vector patches = divide_to_patches_u8(temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) + + llama_image_u8 image_original_resize; + // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square + bicubic_resize(img, image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square + patches.insert(patches.begin(), image_original_resize); + output_slices.buf.resize(patches.size()); + int num = 0; + for (auto & patch : patches) { + normalize_image_u8_to_f32(patch, output_slices.buf[num], params.image_mean, params.image_std); + num++; + } + return output_slices; + } else { + temp.nx = img.nx; + temp.ny = img.ny; + temp.buf.resize(img.buf.size()); + memcpy(temp.buf.data(), img.buf.data(), temp.buf.size()); + } + + const int nx = temp.nx; + const int ny = temp.ny; + // bmp_export(temp, "resized_vanilla.bmp"); + + const int nx2 = params.image_size; + const int ny2 = params.image_size; + std::vector res; + res.resize(3 * nx2 * ny2); + + const float scale = std::max(nx, ny) / (float)params.image_size; + + const int nx3 = int(nx / scale + 0.5f); + const int ny3 = int(ny / scale + 0.5f); + + const auto & m3 = params.image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; + const auto & s3 = params.image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; + + for (int y = 0; y < ny3; y++) { + for (int x = 0; x < nx3; x++) { + for (int c = 0; c < 3; c++) { + // linear interpolation + const float sx = (x + 0.5f) * scale - 0.5f; + const float sy = (y + 0.5f) * scale - 0.5f; + + const int x0 = std::max(0, (int)std::floor(sx)); + const int y0 = 
std::max(0, (int)std::floor(sy)); + + const int x1 = std::min(x0 + 1, nx - 1); + const int y1 = std::min(y0 + 1, ny - 1); + + const float dx = sx - x0; + const float dy = sy - y0; + + const int j00 = 3 * (y0 * nx + x0) + c; + const int j01 = 3 * (y0 * nx + x1) + c; + const int j10 = 3 * (y1 * nx + x0) + c; + const int j11 = 3 * (y1 * nx + x1) + c; + + const float v00 = temp.buf[j00]; + const float v01 = temp.buf[j01]; + const float v10 = temp.buf[j10]; + const float v11 = temp.buf[j11]; + + const float v0 = v00 * (1.0f - dx) + v01 * dx; + const float v1 = v10 * (1.0f - dx) + v11 * dx; + + const float v = v0 * (1.0f - dy) + v1 * dy; + + const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); + + const int i = 3 * (y * nx3 + x) + c; + + res[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c]; + } + } + } + + output_slices.buf.resize(1); + output_slices.buf[0] = std::move(res); + + return output_slices; + }; +}; + +struct llama_vision_processor_uhd : llama_vision_processor { + llama_vision_processor_uhd(const llama_vision_context & ctx) : llama_vision_processor(ctx) {} + int ensure_divide(int length, int patch_size) { return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); } - std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + std::pair find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { int width = original_size.first; int height = original_size.second; if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { @@ -268,7 +426,7 @@ struct minicpmv_preprocessor { return std::make_pair(best_width, best_height); } - std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + std::pair get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { int width, height; std::tie(width, height) = original_size; int grid_x, grid_y; @@ -281,7 +439,7 @@ struct minicpmv_preprocessor { int grid_height = refine_height / grid_y; // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) - auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair + auto best_grid_size = find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair int best_grid_width, best_grid_height; std::tie(best_grid_width, best_grid_height) = best_grid_size; @@ -290,7 +448,7 @@ struct minicpmv_preprocessor { return refine_size; } - std::pair uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::pair find_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { std::vector candidate_split_grids_nums; for (int i : {multiple - 1, multiple, multiple + 1}) { if (i == 1 || i > max_slice_nums) { @@ -322,8 +480,8 @@ struct minicpmv_preprocessor { return best_grid; } - std::vector> uhd_slice_image( - const clip_image_u8 & img, + std::vector> slice_image( + const llama_image_u8 & img, const int max_slice_nums = 9, const int scale_resolution = 448, const int patch_size = 14) { @@ -334,30 +492,30 @@ struct minicpmv_preprocessor { const 
float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); const int multiple = fmin(ceil(ratio), max_slice_nums); - std::vector> images; + std::vector> images; LLAMA_LOG_DEBUG("%s: multiple %d\n", __func__, multiple); - images.push_back(std::vector()); + images.push_back(std::vector()); if (multiple <= 1) { - auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); - clip_image_u8 source_image; + auto best_size = find_best_resize(original_size, scale_resolution, patch_size, true); + llama_image_u8 source_image; bicubic_resize(img, source_image, best_size.first, best_size.second); // source_image = image.resize(best_size, Image.Resampling.BICUBIC) images[images.size()-1].push_back(source_image); } else if (multiple > 1) { - auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); - clip_image_u8 source_image; + auto best_size = find_best_resize(original_size, scale_resolution, patch_size); + llama_image_u8 source_image; bicubic_resize(img, source_image, best_size.first, best_size.second); // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) LLAMA_LOG_DEBUG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img.nx, img.ny, best_size.first, best_size.second); images[images.size()-1].push_back(source_image); - std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + std::pair best_grid = find_best_grid(max_slice_nums, multiple, log_ratio); LLAMA_LOG_DEBUG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img.nx, img.ny, best_grid.first, best_grid.second); - auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); - clip_image_u8 refine_image; + auto refine_size = get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); + llama_image_u8 refine_image; bicubic_resize(img, refine_image, refine_size.first, refine_size.second); LLAMA_LOG_DEBUG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image.nx, refine_image.ny, refine_size.first, refine_size.second); @@ -368,9 +526,9 @@ struct minicpmv_preprocessor { int grid_x = int(width / best_grid.first); int grid_y = int(height / best_grid.second); for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ - images.push_back(std::vector()); + images.push_back(std::vector()); for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ - clip_image_u8 patch; + llama_image_u8 patch; patch.nx = grid_x; patch.ny = grid_y; patch.buf.resize(3 * patch.nx * patch.ny); @@ -389,173 +547,32 @@ struct minicpmv_preprocessor { } return images; } -}; -static llama_vision_patches clip_image_preprocess_minicpmv(const clip_context & ctx, const clip_image_u8 & img) { - auto & params = ctx.model->hparams; - GGML_ASSERT(params.arch == LLM_ARCH_VISION_MINICPMV); - - static const int max_slice_nums = 9; - minicpmv_preprocessor preprocessor; - std::vector> imgs = preprocessor.uhd_slice_image(img, max_slice_nums); - - llama_vision_patches output_patches; - output_patches.n_px = clip_n_patches_x(ctx); - output_patches.n_py = clip_n_patches_y(ctx); - output_patches.px = params.patch_size; - output_patches.py = params.patch_size; - - for (size_t i = 0; i < imgs.size(); ++i) { - for (size_t j = 0; j < imgs[i].size(); ++j) { - std::vector res; - normalize_image_u8_to_f32(imgs[i][j], res, params.image_mean, params.image_std); - 
output_patches.buf.push_back(res); - } - } -} + virtual llama_vision_tokens tokenize(const llama_image_u8 & img) override { + auto & params = ctx.model->hparams; + GGML_ASSERT(params.arch == LLM_ARCH_VISION_MINICPMV); -// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector -// res_imgs memory is being allocated here, previous allocations will be freed if found -static llama_vision_patches clip_image_preprocess(const clip_context & ctx, const clip_image_u8 & img) { - bool pad_to_square = true; - auto & params = ctx.model->hparams; - // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (params.mm_patch_merge_type == MM_PATCH_MERGE_SPATIAL_UNPAD) { - pad_to_square = false; - } + std::vector> imgs = slice_image(img); - llama_vision_patches output_patches; - output_patches.n_px = clip_n_patches_x(ctx); - output_patches.n_py = clip_n_patches_y(ctx); - output_patches.px = params.patch_size; - output_patches.py = params.patch_size; - - // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - - clip_image_u8 temp; - if (pad_to_square && img.nx != img.ny) { - // if the image is not square, pad it to a square - int longer_side = std::max(img.nx, img.ny); - temp.nx = longer_side; - temp.ny = longer_side; - temp.buf.resize(3 * longer_side * longer_side); - const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) - - // fill with background color - for (size_t i = 0; i < temp.buf.size(); i++) { - temp.buf[i] = bc[i % 3]; - } + llama_vision_tokens output; + output.n_px = get_n_patches_x(ctx); + output.n_py = get_n_patches_y(ctx); + output.px = params.patch_size; + output.py = params.patch_size; - // copy from the input image - for (int y = 0; y < img.ny; y++) { - for (int x = 0; x < img.nx; x++) { - const int i = 3 * (y * img.nx + x); - const int j = 3 * (y * temp.nx + x); - temp.buf[j] = img.buf[i]; - temp.buf[j+1] = img.buf[i+1]; - temp.buf[j+2] = img.buf[i+2]; + for (size_t i = 0; i < imgs.size(); ++i) { + for (size_t j = 0; j < imgs[i].size(); ++j) { + std::vector res; + normalize_image_u8_to_f32(imgs[i][j], res, params.image_mean, params.image_std); + output.buf.push_back(res); } } - } else if (params.image_grid_pinpoints[0] != 0) { - // "spatial_unpad" with "anyres" processing for llava-1.6 - std::vector possible_resolutions; - for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) { - clip_image_size s; - s.width = params.image_grid_pinpoints[i]; - s.height = params.image_grid_pinpoints[i+1]; - possible_resolutions.push_back(s); - } - clip_image_size best_resolution = select_best_resolution({img.nx, img.ny}, possible_resolutions); - // clip_image_save_to_bmp(*img, "input.bmp"); - temp = resize_and_pad_image(img, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 - // clip_image_save_to_bmp(*temp, "resized.bmp"); - - std::vector patches = divide_to_patches_u8(temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) - - clip_image_u8 image_original_resize; - // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is 
"shortest_edge", but all CLIP are square - bicubic_resize(img, image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square - patches.insert(patches.begin(), image_original_resize); - // clip_image_f32_batch_init(patches.size()); - output_patches.buf.resize(patches.size()); - int num = 0; - for (auto & patch : patches) { - normalize_image_u8_to_f32(patch, output_patches.buf[num], params.image_mean, params.image_std); - num++; - } - return output_patches; - } else { - temp.nx = img.nx; - temp.ny = img.ny; - temp.buf.resize(img.buf.size()); - memcpy(temp.buf.data(), img.buf.data(), temp.buf.size()); - } - - const int nx = temp.nx; - const int ny = temp.ny; - // bmp_export(temp, "resized_vanilla.bmp"); - const int nx2 = params.image_size; - const int ny2 = params.image_size; - std::vector res; - res.resize(3 * nx2 * ny2); - - const float scale = std::max(nx, ny) / (float)params.image_size; - - const int nx3 = int(nx / scale + 0.5f); - const int ny3 = int(ny / scale + 0.5f); - - const auto & m3 = params.image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; - const auto & s3 = params.image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; - - for (int y = 0; y < ny3; y++) { - for (int x = 0; x < nx3; x++) { - for (int c = 0; c < 3; c++) { - // linear interpolation - const float sx = (x + 0.5f) * scale - 0.5f; - const float sy = (y + 0.5f) * scale - 0.5f; - - const int x0 = std::max(0, (int)std::floor(sx)); - const int y0 = std::max(0, (int)std::floor(sy)); - - const int x1 = std::min(x0 + 1, nx - 1); - const int y1 = std::min(y0 + 1, ny - 1); - - const float dx = sx - x0; - const float dy = sy - y0; - - const int j00 = 3 * (y0 * nx + x0) + c; - const int j01 = 3 * (y0 * nx + x1) + c; - const int j10 = 3 * (y1 * nx + x0) + c; - const int j11 = 3 * (y1 * nx + x1) + c; - - const float v00 = temp.buf[j00]; - const float v01 = temp.buf[j01]; - const float v10 = temp.buf[j10]; - const float v11 = temp.buf[j11]; - - const float v0 = v00 * (1.0f - dx) + v01 * dx; - const float v1 = v10 * (1.0f - dx) + v11 * dx; - - const float v = v0 * (1.0f - dy) + v1 * dy; - - const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); - - const int i = 3 * (y * nx3 + x) + c; - - res[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c]; - } - } + return output; } +}; - output_patches.buf.resize(1); - output_patches.buf[0] = std::move(res); - - return output_patches; -} - -static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, clip_image_size & image_size) { +static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int batch_size, img_size & image_size) { auto & model = *ctx.model; auto & hparams = ctx.model->hparams; @@ -726,7 +743,7 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, // ne is whcn, ne = [1024, 576, 1, 1] embeddings = ggml_get_rows(ctx0, embeddings, patches); - if (hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) { + if (hparams.proj_type == VISION_PROJECTOR_TYPE_MLP) { embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); @@ -734,7 +751,7 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - } else if (hparams.proj_type == CLIP_PROJECTOR_TYPE_LDPV2) { + } else if (hparams.proj_type == VISION_PROJECTOR_TYPE_LDPV2) { int n_patch = 24; struct ggml_tensor * mlp_0 = 
ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); @@ -770,8 +787,8 @@ static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, return gf; } -static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches & patches) { - int batch_size = patches.buf.size(); +static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_vision_tokens & inp) { + int batch_size = inp.buf.size(); auto & model = *ctx.model; auto & hparams = ctx.model->hparams; @@ -779,7 +796,7 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches GGML_ASSERT(batch_size == 1); // TODO: support multiple images } - clip_image_size image_size{(int)hparams.image_size, (int)hparams.image_size}; + img_size image_size{(int)hparams.image_size, (int)hparams.image_size}; const int patch_size = hparams.patch_size; const int num_patches = ((image_size.width / patch_size) * (image_size.height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 1 : 0); @@ -788,7 +805,7 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches LLAMA_LOG_DEBUG("%s: num_positions = %d\n", __func__, num_positions); // build the inference graph - ggml_cgraph * gf = clip_image_build_graph(ctx, batch_size, image_size); + ggml_cgraph * gf = llama_vision_build_graph(ctx, batch_size, image_size); // alloc memory for graph bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf); @@ -803,15 +820,15 @@ static int32_t clip_image_encode(clip_context & ctx, const llama_vision_patches float * data = (float *)malloc(ggml_nbytes(inp_raw)); for (int i = 0; i < batch_size; i++) { - const int nx = patches.px * patches.n_px; - const int ny = patches.py * patches.n_py; + const int nx = inp.px * inp.n_px; + const int ny = inp.py * inp.n_py; const int n = nx * ny; for (int b = 0; b < batch_size; b++) { for (int k = 0; k < 3; k++) { for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = patches.buf[b][3 * (y * nx + x) + k]; + data[(b * 3 * n) + k * n + y * nx + x] = inp.buf[b][3 * (y * nx + x) + k]; } } } @@ -891,34 +908,38 @@ void llama_vision_bitmap_free(llama_vision_bitmap * bmp) { delete bmp; } -struct llama_vision_patches * llama_vision_patches_init( +struct llama_vision_tokens * llama_vision_tokenize( struct llama_context * ctx, llama_vision_bitmap * bmp) { - clip_context & vctx = ctx->vctx; - if (vctx.model->hparams.arch == LLM_ARCH_VISION_MINICPMV) { - return new llama_vision_patches(clip_image_preprocess_minicpmv(vctx, *bmp)); + llama_vision_context & vctx = ctx->vctx; + switch (vctx.model->hparams.arch) { + case LLM_ARCH_VISION_LLAVA: + case LLM_ARCH_VISION_MOBILEVLM: + return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp)); + case LLM_ARCH_VISION_MINICPMV: + return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp)); + default: + GGML_ASSERT(false && "unsupported arch"); } - return new llama_vision_patches(clip_image_preprocess(vctx, *bmp)); } -void llama_vision_patches_free(llama_vision_patches * p) { +void llama_vision_tokens_free(llama_vision_tokens * p) { delete p; } -int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_patches * p) { +int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_tokens * p) { if (p->buf.empty()) { LLAMA_LOG_ERROR("%s: nothing to encode\n", __func__); return -1; } - clip_context & vctx = ctx->vctx; + llama_vision_context & 
vctx = ctx->vctx; auto & hparams = vctx.model->hparams; switch (hparams.mm_patch_merge_type) { case MM_PATCH_MERGE_FLAT: { // flat / default llava-1.5 type embedding - // n_output = clip_n_patches(ctx); - int32_t encoded = clip_image_encode(vctx, *p); + int32_t encoded = llama_vision_encode_impl(vctx, *p); if (encoded != 0) { LLAMA_LOG_ERROR("Unable to encode image\n"); return encoded; @@ -944,7 +965,7 @@ struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx) { // for debugging #ifndef NDEBUG -static int bmp_export(const struct clip_image_u8 &img, const std::string &location) { +static int bmp_export(const struct llama_image_u8 &img, const std::string &location) { const uint32_t width = img.nx; const uint32_t height = img.ny; // swap red and blue channel diff --git a/src/llama-vision.h b/src/llama-vision.h index a9304867fd4d9..374ae45376af7 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -7,12 +7,12 @@ #include #include -enum clip_projector_type { - CLIP_PROJECTOR_TYPE_UNKNOWN, - CLIP_PROJECTOR_TYPE_MLP, - CLIP_PROJECTOR_TYPE_LDPV2, - CLIP_PROJECTOR_TYPE_MINICPMV_2_5, - CLIP_PROJECTOR_TYPE_MINICPMV_2_6, +enum vision_projector_type { + VISION_PROJECTOR_TYPE_UNKNOWN, + VISION_PROJECTOR_TYPE_MLP, + VISION_PROJECTOR_TYPE_LDPV2, + VISION_PROJECTOR_TYPE_MINICPMV_2_5, + VISION_PROJECTOR_TYPE_MINICPMV_2_6, }; enum mm_patch_merge { @@ -21,62 +21,33 @@ enum mm_patch_merge { MM_PATCH_MERGE_SPATIAL_UNPAD, }; -struct clip_hparams { - llm_arch arch = LLM_ARCH_UNKNOWN; - - uint32_t image_size; - uint32_t patch_size; - uint32_t hidden_size; - uint32_t n_intermediate; - uint32_t projection_dim; - uint32_t n_head; - uint32_t n_layer; - uint32_t max_pos_embd; - int32_t select_layer = 0; - bool use_gelu = false; - - float eps; - - clip_projector_type proj_type = CLIP_PROJECTOR_TYPE_UNKNOWN; - mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_UNKNOWN; - - std::array image_mean; - std::array image_std; - - std::array image_grid_pinpoints; // TODO: should this be array of (x, y) pairs? - int32_t image_crop_resolution; -}; - -struct clip_layer { - // attention - struct ggml_tensor * k_w = nullptr; - struct ggml_tensor * k_b = nullptr; - struct ggml_tensor * q_w = nullptr; - struct ggml_tensor * q_b = nullptr; - struct ggml_tensor * v_w = nullptr; - struct ggml_tensor * v_b = nullptr; - - struct ggml_tensor * output_w = nullptr; - struct ggml_tensor * output_b = nullptr; - - // layernorm 1 - struct ggml_tensor * norm_in_w = nullptr; - struct ggml_tensor * norm_in_b = nullptr; - - // ff - struct ggml_tensor * ffn_up_w = nullptr; - struct ggml_tensor * ffn_up_b = nullptr; - - struct ggml_tensor * ffn_down_w = nullptr; - struct ggml_tensor * ffn_down_b = nullptr; - - // layernorm 2 - struct ggml_tensor * norm_out_w = nullptr; - struct ggml_tensor * norm_out_b = nullptr; -}; - -struct clip_vision_model { - struct clip_hparams hparams; +struct llama_vision_model { + struct vision_hparams { + llm_arch arch = LLM_ARCH_UNKNOWN; + + uint32_t image_size; + uint32_t patch_size; + uint32_t hidden_size; + uint32_t n_intermediate; + uint32_t projection_dim; + uint32_t n_head; + uint32_t n_layer; + uint32_t max_pos_embd; + int32_t select_layer = 0; + bool use_gelu = false; + + float eps; + + vision_projector_type proj_type = VISION_PROJECTOR_TYPE_UNKNOWN; + mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_UNKNOWN; + + std::array image_mean; + std::array image_std; + + std::array image_grid_pinpoints; // TODO: should this be array of (x, y) pairs? 
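For reference, a minimal sketch of how the public vision API renamed above is meant to be driven end to end, assuming a llama_context built from a GGUF that contains the vision tower and a caller-filled llama_vision_bitmap (as in examples/vision); this is illustrative only and not part of the patch itself:

static bool encode_image_example(llama_context * lctx, llama_vision_bitmap * bmp) {
    // preprocess the bitmap into patch buffers for the ViT encoder
    llama_vision_tokens * toks = llama_vision_tokenize(lctx, bmp);
    if (toks == nullptr) {
        return false;
    }

    // run the encoder + projector; the result stays buffered inside the context
    const int32_t status = llama_vision_encode(lctx, toks);
    llama_vision_tokens_free(toks);
    if (status != 0) {
        return false; // e.g. empty input or unsupported arch
    }

    // the projected embeddings are later picked up by llama_decode()
    struct ggml_tensor * embd = llama_vision_get_output_tensor(lctx);
    return embd != nullptr;
}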
+ int32_t image_crop_resolution; + }; + struct vision_hparams hparams; ggml_backend_buffer_type_t buft; // embeddings @@ -88,7 +59,34 @@ struct clip_vision_model { struct ggml_tensor * pre_norm_w = nullptr; struct ggml_tensor * pre_norm_b = nullptr; - std::vector layers; + struct vision_layer { + // attention + struct ggml_tensor * k_w = nullptr; + struct ggml_tensor * k_b = nullptr; + struct ggml_tensor * q_w = nullptr; + struct ggml_tensor * q_b = nullptr; + struct ggml_tensor * v_w = nullptr; + struct ggml_tensor * v_b = nullptr; + + struct ggml_tensor * output_w = nullptr; + struct ggml_tensor * output_b = nullptr; + + // layernorm 1 + struct ggml_tensor * norm_in_w = nullptr; + struct ggml_tensor * norm_in_b = nullptr; + + // ff + struct ggml_tensor * ffn_up_w = nullptr; + struct ggml_tensor * ffn_up_b = nullptr; + + struct ggml_tensor * ffn_down_w = nullptr; + struct ggml_tensor * ffn_down_b = nullptr; + + // layernorm 2 + struct ggml_tensor * norm_out_w = nullptr; + struct ggml_tensor * norm_out_b = nullptr; + }; + std::vector layers; struct ggml_tensor * post_norm_w = nullptr; struct ggml_tensor * post_norm_b = nullptr; @@ -132,13 +130,13 @@ struct clip_vision_model { struct ggml_tensor * image_newline = nullptr; }; -struct clip_context { +struct llama_vision_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; ggml_backend_sched_t sched = nullptr; struct ggml_context * ctx_ggml = nullptr; - const clip_vision_model * model; + const llama_vision_model * model; // temporary output data, to be picked up by llama_decode() struct ggml_tensor * output; @@ -147,7 +145,7 @@ struct clip_context { // for now, this only contains: // - the instruction for ggml_conv_2d to break the image into patches // - the pre-processed image data in f32 -struct llama_vision_patches { +struct llama_vision_tokens { uint32_t px; // size of patch uint32_t py; // size of patch size_t n_px; // number of patches in x direction @@ -166,20 +164,20 @@ inline mm_patch_merge mm_patch_merge_from_name(std::string & name) { return MM_PATCH_MERGE_UNKNOWN; } -inline clip_projector_type clip_projector_type_from_name(std::string & name) { +inline vision_projector_type vision_projector_type_from_name(std::string & name) { if (name == "mlp") { - return CLIP_PROJECTOR_TYPE_MLP; + return VISION_PROJECTOR_TYPE_MLP; } else if (name == "ldpv2") { - return CLIP_PROJECTOR_TYPE_LDPV2; + return VISION_PROJECTOR_TYPE_LDPV2; } else if (name == "minicpmv-2.5") { - return CLIP_PROJECTOR_TYPE_MINICPMV_2_5; + return VISION_PROJECTOR_TYPE_MINICPMV_2_5; } else if (name == "minicpmv-2.6") { - return CLIP_PROJECTOR_TYPE_MINICPMV_2_6; + return VISION_PROJECTOR_TYPE_MINICPMV_2_6; } - return CLIP_PROJECTOR_TYPE_UNKNOWN; + return VISION_PROJECTOR_TYPE_UNKNOWN; } // only for sanity check: must be equal to n_embd of language model -uint32_t clip_n_mmproj_embd(const clip_vision_model & clip_model); +uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel); struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx); From 9716c7bff7e4208ed116acd721abcbf8c71d0071 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 22 Jan 2025 14:40:35 +0100 Subject: [PATCH 09/25] temporary refactor llama_vision_graph_builder --- examples/vision/vision.cpp | 8 +- src/llama-model.cpp | 5 + src/llama-vision.cpp | 281 +++++++++++++++++++++---------------- 3 files changed, 174 insertions(+), 120 deletions(-) diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp index 88b5be5bb3d29..d8f8d0f1172bf 
100644 --- a/examples/vision/vision.cpp +++ b/examples/vision/vision.cpp @@ -50,7 +50,7 @@ static llama_vision_bitmap * load_image_from_file(const char * fname) { } // split string by a `std::string delim` instead of `char delim` -static std::vector string_split(std::string s, const std::string & delimiter) { +static std::vector string_split_str(std::string s, const std::string & delimiter) { std::vector tokens; size_t pos = 0; std::string token; @@ -76,7 +76,7 @@ static std::vector tokenize_with_img_placement( const std::string & text, bool add_special, bool parse_special) { - std::vector parts = string_split(text, IMG_PLACEMENT); + std::vector parts = string_split_str(text, IMG_PLACEMENT); std::vector output; for (const auto & part : parts) { //printf("tokenizing part: %s\n", part.c_str()); @@ -114,6 +114,10 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_init.context.get(); const llama_model * model = llama_init.model.get(); const llama_vocab * vocab = llama_model_get_vocab(model); + if (!model) { + LOG_ERR("failed to load model\n"); + return 1; + } struct common_sampler * smpl = common_sampler_init(model, params.sampling); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9d30f636e0c9f..d8ff9c0e56227 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4056,6 +4056,11 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN2VL: return LLAMA_ROPE_TYPE_MROPE; + case LLM_ARCH_VISION_LLAVA: + case LLM_ARCH_VISION_MOBILEVLM: + case LLM_ARCH_VISION_MINICPMV: + GGML_ABORT("vision arch does not use RoPE"); + // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: GGML_ABORT("unknown architecture"); diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index c583593b73dba..3cae05d6b457c 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -19,6 +19,8 @@ struct img_size; static int bmp_export(const struct llama_image_u8 &img, const std::string &location); #endif +#define VISION_GRAPH_MAX_NODE 1024 + struct img_size { int width; int height; @@ -403,7 +405,7 @@ struct llama_vision_processor_llava : llama_vision_processor { output_slices.buf[0] = std::move(res); return output_slices; - }; + } }; struct llama_vision_processor_uhd : llama_vision_processor { @@ -572,33 +574,56 @@ struct llama_vision_processor_uhd : llama_vision_processor { } }; -static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int batch_size, img_size & image_size) { - auto & model = *ctx.model; - auto & hparams = ctx.model->hparams; - - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int patch_size = hparams.patch_size; - const float eps = hparams.eps; - const int num_patches = ((image_size.width / patch_size) * (image_size.height / patch_size)); - const int num_positions = num_patches + (model.class_embedding ? 
1 : 0); - - LLAMA_LOG_DEBUG("%s: num_patches = %d\n", __func__, num_patches); - - struct ggml_init_params params = { - /*.mem_size =*/ ctx.buf_compute_meta.size(), - /*.mem_buffer =*/ ctx.buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; +// TODO: move this to llm_build_context in llama.cpp +struct llama_vision_graph_builder { + llama_vision_context & ctx; + const llama_vision_model & model; + struct ggml_context * ctx0; + int batch_size; + int hidden_size; + int n_head; + int d_head; + int patch_size; + float eps; + int num_patches; + int num_positions; + int img_w; + int img_h; + bool use_gelu; + int n_layers; + vision_projector_type proj_type; + + llama_vision_graph_builder(llama_vision_context & ctx, const llama_vision_tokens & inp) : ctx(ctx), model(*ctx.model) { + struct ggml_init_params params = { + /*.mem_size =*/ ctx.buf_compute_meta.size(), + /*.mem_buffer =*/ ctx.buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0 = ggml_init(params); + + auto & hparams = ctx.model->hparams; + + batch_size = inp.buf.size(); + hidden_size = hparams.hidden_size; + n_head = hparams.n_head; + d_head = hidden_size / n_head; + patch_size = hparams.patch_size; + eps = hparams.eps; + num_patches = inp.n_px * inp.n_py; + num_positions = num_patches + (model.class_embedding ? 1 : 0); + img_w = inp.px * inp.n_px; + img_h = inp.py * inp.n_py; + use_gelu = hparams.use_gelu; + n_layers = (int)hparams.n_layer + hparams.select_layer; + proj_type = hparams.proj_type; + } - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + ~llama_vision_graph_builder() { + ggml_free(ctx0); + } - // input - struct ggml_tensor * embeddings; - { - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size.width, image_size.height, 3, batch_size); + struct ggml_tensor * build_inp() { + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img_w, img_h, 3, batch_size); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); @@ -612,37 +637,51 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba } // auto * ne = inp->ne; printf("%d %d %d %d\n", ne[0], ne[1], ne[2], ne[3]); - embeddings = inp; + struct ggml_tensor * embd = inp; if (model.class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + embd = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_name(embd, "inp_embd"); + ggml_set_input(embd); + + embd = ggml_acc(ctx0, embd, model.class_embedding, + embd->nb[1], embd->nb[2], embd->nb[3], 0); + embd = ggml_acc(ctx0, embd, inp, + embd->nb[1], embd->nb[2], embd->nb[3], model.class_embedding->nb[1]); } struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); - ggml_set_name(positions, "positions"); + ggml_set_name(positions, "inp_pos"); ggml_set_input(positions); - embeddings = ggml_add(ctx0, - embeddings, + embd = ggml_add(ctx0, + embd, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + return embd; + } + + struct ggml_tensor * build_pre_norm(struct ggml_tensor * cur) { + if (model.pre_norm_w) { + cur = ggml_norm(ctx0, cur, 
eps); + ggml_set_name(cur, "pre_ln"); + + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.pre_norm_w), model.pre_norm_b); + } + return cur; } - // pre-layernorm - if (model.pre_norm_w) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "pre_ln"); + struct ggml_tensor * build_post_norm(struct ggml_tensor * cur) { + if (model.post_norm_w) { + cur = ggml_norm(ctx0, cur, eps); + ggml_set_name(cur, "post_ln"); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_norm_w), model.pre_norm_b); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.post_norm_w), model.post_norm_b); + } + return cur; } - // loop over layers - for (int il = 0; il < (int)hparams.n_layer + hparams.select_layer; il++) { - struct ggml_tensor * cur = embeddings; + struct ggml_tensor * build_layer(struct ggml_tensor * inpL, int il) { + struct ggml_tensor * cur = inpL; // layernorm1 { @@ -654,7 +693,6 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba // self-attention { - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); @@ -693,9 +731,9 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].output_w, cur), model.layers[il].output_b); // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, embeddings); + cur = ggml_add(ctx0, cur, inpL); - embeddings = cur; // embeddings = residual, cur = hidden_states + inpL = cur; // inpL = residual, cur = hidden_states // layernorm2 { @@ -708,7 +746,7 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_w, cur); cur = ggml_add(ctx0, cur, model.layers[il].ffn_up_b); - if (hparams.use_gelu) { + if (use_gelu) { cur = ggml_gelu_inplace(ctx0, cur); } else { cur = ggml_gelu_quick_inplace(ctx0, cur); @@ -718,74 +756,76 @@ static ggml_cgraph * llama_vision_build_graph(llama_vision_context & ctx, int ba cur = ggml_add(ctx0, cur, model.layers[il].ffn_down_b); // residual 2 - cur = ggml_add(ctx0, embeddings, cur); + cur = ggml_add(ctx0, inpL, cur); - embeddings = cur; + return cur; } - // post-layernorm - if (model.post_norm_w) { - embeddings = ggml_norm(ctx0, embeddings, eps); - ggml_set_name(embeddings, "post_ln"); - - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_norm_w), model.post_norm_b); - } + // graph for each vision arch - // llava projector - { - embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - - struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); - ggml_set_name(patches, "patches"); - ggml_set_input(patches); - - // shape [1, 576, 1024] - // ne is whcn, ne = [1024, 576, 1, 1] - embeddings = ggml_get_rows(ctx0, embeddings, patches); - - if (hparams.proj_type == VISION_PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); - - embeddings = ggml_gelu(ctx0, embeddings); - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - - } else if (hparams.proj_type == VISION_PROJECTOR_TYPE_LDPV2) { - int n_patch = 24; - struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); - mlp_0 = ggml_gelu(ctx0, mlp_0); - struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, 
model.mm_model_mlp_2_w, mlp_0); - mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); - // mlp_2 ne = [2048, 576, 1, 1] - // // AVG Pool Layer 2*2, strides = 2 - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); - // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); - // mlp_2 ne [24, 24, 2048, 1] - mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); - // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); - peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, mlp_2); - peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); - embeddings = peg_0; + struct ggml_cgraph * build_llava() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false); + struct ggml_tensor * cur = build_inp(); + cur = build_pre_norm(cur); + for (int il = 0; il < n_layers; il++) { + cur = build_layer(cur, il); + } + cur = build_post_norm(cur); - } else { - GGML_ASSERT(false && "unsupported proj type"); + // llava projector + { + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1]); + + struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_set_name(patches, "inp_patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + cur = ggml_get_rows(ctx0, cur, patches); + + if (proj_type == VISION_PROJECTOR_TYPE_MLP) { + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + cur = ggml_add(ctx0, cur, model.mm_1_b); + + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + cur = ggml_add(ctx0, cur, model.mm_2_b); + + } else if (proj_type == VISION_PROJECTOR_TYPE_LDPV2) { + int n_patch = 24; + struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, cur); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3)); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + cur = ggml_cont(ctx0, peg_0); + + } else { + GGML_ASSERT(false && "unsupported proj type"); + } } - } - embeddings = ggml_cont(ctx0, embeddings); + ggml_set_name(cur, "output"); + ggml_build_forward_expand(gf, cur); - // build the graph - ggml_build_forward_expand(gf, embeddings); - ggml_free(ctx0); - return gf; -} + return gf; + } +}; static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const 
llama_vision_tokens & inp) { int batch_size = inp.buf.size(); @@ -805,7 +845,16 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ LLAMA_LOG_DEBUG("%s: num_positions = %d\n", __func__, num_positions); // build the inference graph - ggml_cgraph * gf = llama_vision_build_graph(ctx, batch_size, image_size); + llama_vision_graph_builder builder(ctx, inp); + ggml_cgraph * gf; + switch(hparams.arch) { + case LLM_ARCH_VISION_LLAVA: + case LLM_ARCH_VISION_MOBILEVLM: + gf = builder.build_llava(); + break; + default: + GGML_ASSERT(false && "unsupported arch"); + } // alloc memory for graph bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf); @@ -839,16 +888,12 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ } if (model.class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); + struct ggml_tensor * inp_embd = ggml_graph_get_tensor(gf, "inp_embd"); + ggml_set_zero(inp_embd); } { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "inp_pos"); int* positions_data = (int*)malloc(ggml_nbytes(positions)); for (int i = 0; i < num_positions; i++) { @@ -859,7 +904,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ } { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "inp_patches"); int* patches_data = (int*)malloc(ggml_nbytes(patches)); for (int i = 0; i < num_patches; i++) { patches_data[i] = i + 1; From ba489b474373758346a4efc5ff5273d0e4577530 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 22 Jan 2025 22:26:38 +0100 Subject: [PATCH 10/25] wip minicpmv --- convert_hf_to_gguf.py | 105 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e164e9a0712d7..8b8d3988b72e5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -204,9 +204,10 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: f"Missing tensors: {missing}\n" f"Extra tensors: {extra}") - def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") + def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight", is_vision = False) -> str: + arch = self.vision_arch if is_vision and self.vision_arch is not None else self.model_arch + if key not in gguf.MODEL_TENSORS[arch]: + raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {arch!r}") name: str = gguf.TENSOR_NAMES[key] if "{bid}" in name: assert bid is not None @@ -2144,6 +2145,7 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM proj_type: gguf.constants.CLIPProjectorType | None + resampler_n_embd = 0 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2162,6 +2164,12 @@ def __init__(self, *args, **kwargs): self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6 else: raise ValueError(f"Unsupported MiniCPM-V version: {version}") 
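Stepping back to the C++ encode path earlier in this series: when inp_raw is filled, each preprocessed buffer is copied from interleaved RGBRGB floats into one plane per channel, the planar layout the graph's inp_raw tensor uses. A standalone sketch of that conversion for a single nx-by-ny image (illustrative only, assuming <vector> is available):

static void interleaved_to_planar(const std::vector<float> & src, float * dst, int nx, int ny) {
    const int n = nx * ny;
    for (int k = 0; k < 3; k++) {              // one plane per RGB channel
        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++) {
                dst[k * n + y * nx + x] = src[3 * (y * nx + x) + k];
            }
        }
    }
}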
+ # TODO: how to do this without reading the whole safetensor file? + for tname, tensor in self.get_tensors(): + if tname == "resampler.ln_post.bias": + self.resampler_n_embd = tensor.shape[0] + if self.resampler_n_embd < 2: + raise ValueError("Failed to detect resampler embedding size") if self.vparams is not None and self.vision_arch is not None and self.preprocessor_config is not None: self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5] @@ -2220,6 +2228,12 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + if self.vision_arch == gguf.MODEL_ARCH.VISION_MINICPMV: + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True), + torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70))) + ) + def set_vocab(self): if self.vision_arch == gguf.MODEL_ARCH.VISION_MINICPMV: # undocumented anywhere, I only found this thanks to https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf @@ -2233,11 +2247,23 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # For vision model if name.startswith("llm."): name = name.replace("llm.", "") - # attention, someone mess up and use underscore instead of dot - if name.endswith("in_proj_weight"): - name = name.replace("_weight", ".weight") - if name.endswith("in_proj_bias"): - name = name.replace("_bias", ".bias") + + # split the resampler.attn.in_proj_(weight|bias) tensors into q, k, v + if name.endswith("in_proj_weight") or name.endswith("in_proj_bias"): + assert data_torch.shape[0] == 3 * self.resampler_n_embd + split_tensor = data_torch.chunk(3, dim=0) + name_q = name.replace("in_proj_", "in_proj_q.") # in_proj_q.(weight|bias) + name_k = name.replace("in_proj_", "in_proj_k.") # in_proj_k.(weight|bias) + name_v = name.replace("in_proj_", "in_proj_v.") # in_proj_v.(weight|bias) + return [ + (self.map_tensor_name(name_q), split_tensor[0]), + (self.map_tensor_name(name_k), split_tensor[1]), + (self.map_tensor_name(name_v), split_tensor[2]), + ] + + if name == "resampler.proj" or name == "resampler.query": + name += ".weight" + if "post_layernorm" in name: return [] # skip post_layernorm @@ -2251,6 +2277,69 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + del name, bid # unused + if "v.resmpl.query" in new_name or "v.resmpl.pos_embd_k" in new_name: + return gguf.GGMLQuantizationType.F32 + if "v.resmpl." 
in new_name: + return gguf.GGMLQuantizationType.F32 if n_dims == 1 else gguf.GGMLQuantizationType.F16 + return False + + # utils to work with MiniCPM-V resampler + + # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 + def _get_2d_sincos_pos_embed(self, embed_dim: int, grid_size: tuple[int, int] | int, cls_token=False) -> np.ndarray: + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = self._get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + def _get_2d_sincos_pos_embed_from_grid(self, embed_dim: int, grid: np.ndarray) -> np.ndarray: + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + def _get_1d_sincos_pos_embed_from_grid(self, embed_dim: int, pos: np.ndarray) -> np.ndarray: + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb @Model.register("MiniCPM3ForCausalLM") From c0d93dd5093d00e3653190b7c5439eae50604fa2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 22 Jan 2025 22:42:00 +0100 Subject: [PATCH 11/25] minicpmv works but missing uhd slices --- convert_hf_to_gguf.py | 309 +++++++++++++++++---------------- examples/vision/vision.cpp | 5 +- gguf-py/gguf/constants.py | 37 ++-- gguf-py/gguf/tensor_mapping.py | 20 ++- src/llama-arch.cpp | 30 +++- src/llama-arch.h | 12 +- src/llama-model.cpp | 162 +++++++++-------- src/llama-model.h | 2 +- src/llama-vision.cpp | 103 +++++++++-- src/llama-vision.h | 38 ++-- src/llama.cpp | 4 +- 11 files changed, 432 insertions(+), 290 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8b8d3988b72e5..9a05e99602575 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2141,60 +2141,19 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: return n_dims > 1 -@Model.register("MiniCPMForCausalLM", "MiniCPMV") +@Model.register("MiniCPMForCausalLM") class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM - proj_type: gguf.constants.CLIPProjectorType | None - resampler_n_embd = 0 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - model_type = self.hparams.get("model_type", None) - - # only tested with https://huggingface.co/openbmb/MiniCPM-V-2_6 - if "vision_config" in self.hparams and model_type == "minicpmv": - self.vparams = self.hparams["vision_config"] - self.preprocessor_config = self.load_preprocessor_config(self.dir_model) - self.vision_arch = gguf.MODEL_ARCH.VISION_MINICPMV - version = str(self.hparams.get("version", "unknown")) - if version == "2.5": - self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_5 - elif version == "2.6": - self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6 - else: - raise ValueError(f"Unsupported MiniCPM-V version: {version}") - # TODO: how to do this without reading the whole safetensor file? 
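The numpy helpers above (removed here from MiniCPMModel and re-added to a dedicated class later in this series) build the fixed sin/cos table that conversion writes out as v.resmpl.pos_embd_k over a 70x70 grid. The same 1D building block written as a C++ sketch for clarity (illustrative only; first half sines, second half cosines, dim assumed even):

#include <cmath>
#include <vector>

// mirrors _get_1d_sincos_pos_embed_from_grid for a single position:
// out[i] = sin(pos / 10000^(i/(dim/2))), out[dim/2 + i] = cos(pos / 10000^(i/(dim/2)))
static std::vector<float> sincos_1d(int dim, float pos) {
    const int half = dim / 2;
    std::vector<float> out(dim);
    for (int i = 0; i < half; i++) {
        const double omega = 1.0 / std::pow(10000.0, (double) i / half);
        out[i]        = (float) std::sin(pos * omega);
        out[half + i] = (float) std::cos(pos * omega);
    }
    return out;
}

// the 2D table concatenates two such vectors of size dim/2 per grid cell,
// one for each of the two grid coordinates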
- for tname, tensor in self.get_tensors(): - if tname == "resampler.ln_post.bias": - self.resampler_n_embd = tensor.shape[0] - if self.resampler_n_embd < 2: - raise ValueError("Failed to detect resampler embedding size") - - if self.vparams is not None and self.vision_arch is not None and self.preprocessor_config is not None: - self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5] - self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5] - self.hparams["vision_feature_layer"] = 0 - self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) def set_gguf_parameters(self): super().set_gguf_parameters() - # scale_emb - embedding_scale = float(self.hparams.get("scale_emb", 1.0)) + embedding_scale = float(self.hparams["scale_emb"]) self.gguf_writer.add_embedding_scale(embedding_scale) logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") - # scale_depth - if "scale_depth" in self.hparams: - residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 - else: - residual_scale = 1.0 + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 self.gguf_writer.add_residual_scale(residual_scale) logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") - # logit_scale - if "dim_model_base" in self.hparams: - logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] - else: - logit_scale = 1.0 + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] self.gguf_writer.add_logit_scale(logit_scale) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") if self.hparams.get("rope_scaling") is not None: @@ -2202,15 +2161,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") - # For vision model - if self.vparams is not None and self.proj_type is not None: - self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) - self.gguf_writer.add_vision_vit_projector_type(self.proj_type) - self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06) - max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 - self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] @@ -2228,118 +2178,22 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - if self.vision_arch == gguf.MODEL_ARCH.VISION_MINICPMV: - yield ( - self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True), - torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70))) - ) - def set_vocab(self): - if self.vision_arch == gguf.MODEL_ARCH.VISION_MINICPMV: - # undocumented anywhere, I only found this thanks to https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf - self._set_vocab_gpt2() - else: - self._set_vocab_sentencepiece() + self._set_vocab_sentencepiece() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # For vision model - if name.startswith("llm."): - name = name.replace("llm.", "") - - # split the 
resampler.attn.in_proj_(weight|bias) tensors into q, k, v - if name.endswith("in_proj_weight") or name.endswith("in_proj_bias"): - assert data_torch.shape[0] == 3 * self.resampler_n_embd - split_tensor = data_torch.chunk(3, dim=0) - name_q = name.replace("in_proj_", "in_proj_q.") # in_proj_q.(weight|bias) - name_k = name.replace("in_proj_", "in_proj_k.") # in_proj_k.(weight|bias) - name_v = name.replace("in_proj_", "in_proj_v.") # in_proj_v.(weight|bias) - return [ - (self.map_tensor_name(name_q), split_tensor[0]), - (self.map_tensor_name(name_k), split_tensor[1]), - (self.map_tensor_name(name_v), split_tensor[2]), - ] - - if name == "resampler.proj" or name == "resampler.query": - name += ".weight" - - if "post_layernorm" in name: - return [] # skip post_layernorm - n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") # HF models permute some of the tensors, so we need to undo that - if not name.startswith("vpm") and name.endswith(("q_proj.weight")): + if name.endswith(("q_proj.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if not name.startswith("vpm") and name.endswith(("k_proj.weight")): + if name.endswith(("k_proj.weight")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] - - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: - del name, bid # unused - if "v.resmpl.query" in new_name or "v.resmpl.pos_embd_k" in new_name: - return gguf.GGMLQuantizationType.F32 - if "v.resmpl." in new_name: - return gguf.GGMLQuantizationType.F32 if n_dims == 1 else gguf.GGMLQuantizationType.F16 - return False - - # utils to work with MiniCPM-V resampler - - # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 - def _get_2d_sincos_pos_embed(self, embed_dim: int, grid_size: tuple[int, int] | int, cls_token=False) -> np.ndarray: - """ - grid_size: int of the grid height and width - return: - pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) - """ - if isinstance(grid_size, int): - grid_h_size, grid_w_size = grid_size, grid_size - else: - grid_h_size, grid_w_size = grid_size[0], grid_size[1] - - grid_h = np.arange(grid_h_size, dtype=np.float32) - grid_w = np.arange(grid_w_size, dtype=np.float32) - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) - pos_embed = self._get_2d_sincos_pos_embed_from_grid(embed_dim, grid) - if cls_token: - pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) - return pos_embed - - def _get_2d_sincos_pos_embed_from_grid(self, embed_dim: int, grid: np.ndarray) -> np.ndarray: - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) - emb_w = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) - - emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) - return emb - - def _get_1d_sincos_pos_embed_from_grid(self, embed_dim: int, pos: np.ndarray) -> np.ndarray: - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (M,) - out: (M, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float32) - omega /= embed_dim / 2. - omega = 1. 
/ 10000 ** omega # (D/2,) - - pos = pos.reshape(-1) # (M,) - out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product - - emb_sin = np.sin(out) # (M, D/2) - emb_cos = np.cos(out) # (M, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) - return emb @Model.register("MiniCPM3ForCausalLM") @@ -2479,6 +2333,155 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: yield name, data +@Model.register("MiniCPMV") +class MiniCPMVModel(Qwen2Model): + # based on minicpmv-surgery.py, not sure why it is Qwen2Model instead of MiniCPMModel + model_arch = gguf.MODEL_ARCH.QWEN2 + proj_type: gguf.constants.CLIPProjectorType | None + resampler_n_embd = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + model_type = self.hparams.get("model_type", None) + + # only tested with https://huggingface.co/openbmb/MiniCPM-V-2_6 + if "vision_config" in self.hparams and model_type == "minicpmv": + self.vparams = self.hparams["vision_config"] + self.preprocessor_config = self.load_preprocessor_config(self.dir_model) + self.vision_arch = gguf.MODEL_ARCH.VISION_MINICPMV + version = str(self.hparams.get("version", "unknown")) + if version == "2.5": + self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_5 + elif version == "2.6": + self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6 + else: + raise ValueError(f"Unsupported MiniCPM-V version: {version}") + # TODO: how to do this without reading the whole safetensor file? + for tname, tensor in self.get_tensors(): + if tname == "resampler.ln_post.bias": + self.resampler_n_embd = tensor.shape[0] + if self.resampler_n_embd < 2: + raise ValueError("Failed to detect resampler embedding size") + else: + raise ValueError("Expected vision_config, but not found") + + if self.vparams is not None and self.vision_arch is not None and self.preprocessor_config is not None: + self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5] + self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5] + self.hparams["vision_feature_layer"] = 0 + self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + # For vision model + if self.vparams is not None and self.proj_type is not None: + self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) + self.gguf_writer.add_vision_vit_projector_type(self.proj_type) + self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06) + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd) + + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True), + torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70))) + ) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # for language part + if name.startswith("llm."): + return [(self.map_tensor_name(name.replace("llm.", "")), data_torch)] + + # split the resampler.attn.in_proj_(weight|bias) tensors into q, k, v + if name.endswith("in_proj_weight") or name.endswith("in_proj_bias"): + assert data_torch.shape[0] == 3 * self.resampler_n_embd + split_tensor = data_torch.chunk(3, dim=0) + name_q = name.replace("in_proj_", "in_proj_q.") # in_proj_q.(weight|bias) + name_k = name.replace("in_proj_", "in_proj_k.") # in_proj_k.(weight|bias) 
+ name_v = name.replace("in_proj_", "in_proj_v.") # in_proj_v.(weight|bias) + return [ + (self.map_tensor_name(name_q), split_tensor[0]), + (self.map_tensor_name(name_k), split_tensor[1]), + (self.map_tensor_name(name_v), split_tensor[2]), + ] + + # append .weight to these tensors + if name == "resampler.proj" or name == "resampler.query": + name += ".weight" + + if "post_layernorm" in name: + return [] # skip post_layernorm + + return [(self.map_tensor_name(name), data_torch)] + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + del name, bid # unused + if "v.resmpl.query" in new_name or "v.resmpl.pos_embd_k" in new_name: + return gguf.GGMLQuantizationType.F32 + if "v.resmpl." in new_name: + return gguf.GGMLQuantizationType.F32 if n_dims == 1 else gguf.GGMLQuantizationType.F16 + return False + + # utils to work with MiniCPM-V resampler + + # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 + def _get_2d_sincos_pos_embed(self, embed_dim: int, grid_size: tuple[int, int] | int, cls_token=False) -> np.ndarray: + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = self._get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + def _get_2d_sincos_pos_embed_from_grid(self, embed_dim: int, grid: np.ndarray) -> np.ndarray: + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = self._get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + def _get_1d_sincos_pos_embed_from_grid(self, embed_dim: int, pos: np.ndarray) -> np.ndarray: + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. / 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + @Model.register("WavTokenizerDec") class WavTokenizerDecModel(Model): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp index d8f8d0f1172bf..d994535f6e66f 100644 --- a/examples/vision/vision.cpp +++ b/examples/vision/vision.cpp @@ -98,8 +98,9 @@ int main(int argc, char ** argv) { common_params params; // default prompt for llava 1.5 - params.prompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n" - "USER:\nwhat did you see?\nASSISTANT:"; + //params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:\nwhat did you see?\nASSISTANT:"; + // default prompt for minicpmv 2.6 + params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nwhat did you see?\n<|im_end|>\n<|im_start|>assistant\n"; params.n_predict = 64; params.n_batch = 2048; params.n_ubatch = 1024; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 601016eda7449..6cc9609fcf8ca 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -457,12 +457,14 @@ class MODEL_TENSOR(IntEnum): V_PRE_NORM = auto() V_POST_NORM = auto() V_RESMPL_POS_EMBD_K = auto() # minicpmv - V_RESMPL_ATTN_IN = auto() # minicpmv + V_RESMPL_ATTN_Q = auto() # minicpmv + V_RESMPL_ATTN_K = auto() # minicpmv + V_RESMPL_ATTN_V = auto() # minicpmv V_RESMPL_ATTN_OUT = auto() # minicpmv - V_RESMPL_KV_PROJ = auto() # minicpmv - V_RESMPL_NORM_POST = auto() # minicpmv - V_RESMPL_NORM_KV = auto() # minicpmv - V_RESMPL_NORM_Q = auto() # minicpmv + V_RESMPL_KV = auto() # minicpmv + V_RESMPL_KV_NORM = auto() # minicpmv + V_RESMPL_POST_NORM = auto() # minicpmv + V_RESMPL_Q_NORM = auto() # minicpmv V_RESMPL_PROJ = auto() # minicpmv V_RESMPL_QUERY = auto() # minicpmv @@ -674,12 +676,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_PRE_NORM: "v.pre_norm", MODEL_TENSOR.V_POST_NORM: "v.post_norm", MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "v.resmpl.pos_embd_k", - MODEL_TENSOR.V_RESMPL_ATTN_IN: "v.resmpl.attn_in", + MODEL_TENSOR.V_RESMPL_ATTN_Q: "v.resmpl.attn_q", + MODEL_TENSOR.V_RESMPL_ATTN_K: "v.resmpl.attn_k", + MODEL_TENSOR.V_RESMPL_ATTN_V: "v.resmpl.attn_v", MODEL_TENSOR.V_RESMPL_ATTN_OUT: "v.resmpl.attn_out", - MODEL_TENSOR.V_RESMPL_KV_PROJ: "v.resmpl.kv_proj", - MODEL_TENSOR.V_RESMPL_NORM_POST: "v.resmpl.norm_post", - MODEL_TENSOR.V_RESMPL_NORM_KV: "v.resmpl.norm_kv", - MODEL_TENSOR.V_RESMPL_NORM_Q: "v.resmpl.norm_q", + MODEL_TENSOR.V_RESMPL_KV: "v.resmpl.kv", + MODEL_TENSOR.V_RESMPL_KV_NORM: "v.resmpl.kv_norm", + MODEL_TENSOR.V_RESMPL_POST_NORM: "v.resmpl.post_norm", + MODEL_TENSOR.V_RESMPL_Q_NORM: "v.resmpl.q_norm", MODEL_TENSOR.V_RESMPL_PROJ: "v.resmpl.proj", MODEL_TENSOR.V_RESMPL_QUERY: "v.resmpl.query", } @@ -1667,12 +1671,15 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_OUTPUT_NORM, MODEL_TENSOR.V_ENC_FFN_UP, MODEL_TENSOR.V_ENC_FFN_DOWN, - MODEL_TENSOR.V_RESMPL_ATTN_IN, + MODEL_TENSOR.V_RESMPL_POS_EMBD_K, + MODEL_TENSOR.V_RESMPL_ATTN_Q, + MODEL_TENSOR.V_RESMPL_ATTN_K, + MODEL_TENSOR.V_RESMPL_ATTN_V, MODEL_TENSOR.V_RESMPL_ATTN_OUT, - MODEL_TENSOR.V_RESMPL_KV_PROJ, - MODEL_TENSOR.V_RESMPL_NORM_POST, - MODEL_TENSOR.V_RESMPL_NORM_KV, - MODEL_TENSOR.V_RESMPL_NORM_Q, + MODEL_TENSOR.V_RESMPL_KV, + MODEL_TENSOR.V_RESMPL_KV_NORM, + MODEL_TENSOR.V_RESMPL_POST_NORM, + MODEL_TENSOR.V_RESMPL_Q_NORM, MODEL_TENSOR.V_RESMPL_PROJ, MODEL_TENSOR.V_RESMPL_QUERY, ], diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 92e1d499ae0d9..b756ec184136e 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -868,27 +868,35 @@ class TensorNameMap: "resampler.pos_embed_k", ), - MODEL_TENSOR.V_RESMPL_ATTN_IN: ( - "resampler.attn.in_proj", + MODEL_TENSOR.V_RESMPL_ATTN_Q: ( + "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj + ), + + 
MODEL_TENSOR.V_RESMPL_ATTN_K: ( + "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_V: ( + "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj ), MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( "resampler.attn.out_proj", ), - MODEL_TENSOR.V_RESMPL_KV_PROJ: ( + MODEL_TENSOR.V_RESMPL_KV: ( "resampler.kv_proj", ), - MODEL_TENSOR.V_RESMPL_NORM_POST: ( + MODEL_TENSOR.V_RESMPL_POST_NORM: ( "resampler.ln_post", ), - MODEL_TENSOR.V_RESMPL_NORM_KV: ( + MODEL_TENSOR.V_RESMPL_KV_NORM: ( "resampler.ln_kv", ), - MODEL_TENSOR.V_RESMPL_NORM_Q: ( + MODEL_TENSOR.V_RESMPL_Q_NORM: ( "resampler.ln_q", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 73f4b57db9715..0b20b03f6a0de 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1372,12 +1372,14 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, { LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, { LLM_TENSOR_V_RESMPL_POS_EMBD_K, "v.resmpl.pos_embd_k" }, - { LLM_TENSOR_V_RESMPL_ATTN_IN, "v.resmpl.attn_in" }, + { LLM_TENSOR_V_RESMPL_ATTN_Q, "v.resmpl.attn_q" }, + { LLM_TENSOR_V_RESMPL_ATTN_K, "v.resmpl.attn_k" }, + { LLM_TENSOR_V_RESMPL_ATTN_V, "v.resmpl.attn_v" }, { LLM_TENSOR_V_RESMPL_ATTN_OUT, "v.resmpl.attn_out" }, - { LLM_TENSOR_V_RESMPL_KV_PROJ, "v.resmpl.kv_proj" }, - { LLM_TENSOR_V_RESMPL_NORM_POST, "v.resmpl.norm_post" }, - { LLM_TENSOR_V_RESMPL_NORM_KV, "v.resmpl.norm_kv" }, - { LLM_TENSOR_V_RESMPL_NORM_Q, "v.resmpl.norm_q" }, + { LLM_TENSOR_V_RESMPL_KV, "v.resmpl.kv" }, + { LLM_TENSOR_V_RESMPL_KV_NORM, "v.resmpl.kv_norm" }, + { LLM_TENSOR_V_RESMPL_POST_NORM, "v.resmpl.post_norm" }, + { LLM_TENSOR_V_RESMPL_Q_NORM, "v.resmpl.q_norm" }, { LLM_TENSOR_V_RESMPL_PROJ, "v.resmpl.proj" }, { LLM_TENSOR_V_RESMPL_QUERY, "v.resmpl.query" }, } @@ -1531,6 +1533,24 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + // vision + {LLM_TENSOR_V_MMPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_MMPROJ_MLP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_MMPROJ_PEG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_ENC_EMBD_CLS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}}, + {LLM_TENSOR_V_ENC_EMBD_PATCH, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}}, + {LLM_TENSOR_V_ENC_EMBD_POS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}}, + {LLM_TENSOR_V_ENC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_ENC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_ENC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_ENC_INPUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_V_ENC_OUTPUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_V_ENC_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_ENC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_PRE_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_V_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + // TODO: add minicpmv resampler tensors }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index 7545138924fd5..4f3e76a5f20c6 100644 --- 
a/src/llama-arch.h +++ b/src/llama-arch.h @@ -371,12 +371,14 @@ enum llm_tensor { LLM_TENSOR_V_POST_NORM, // vision - minicpmv LLM_TENSOR_V_RESMPL_POS_EMBD_K, - LLM_TENSOR_V_RESMPL_ATTN_IN, + LLM_TENSOR_V_RESMPL_ATTN_Q, + LLM_TENSOR_V_RESMPL_ATTN_K, + LLM_TENSOR_V_RESMPL_ATTN_V, LLM_TENSOR_V_RESMPL_ATTN_OUT, - LLM_TENSOR_V_RESMPL_KV_PROJ, - LLM_TENSOR_V_RESMPL_NORM_POST, - LLM_TENSOR_V_RESMPL_NORM_KV, - LLM_TENSOR_V_RESMPL_NORM_Q, + LLM_TENSOR_V_RESMPL_KV, + LLM_TENSOR_V_RESMPL_KV_NORM, + LLM_TENSOR_V_RESMPL_POST_NORM, + LLM_TENSOR_V_RESMPL_Q_NORM, LLM_TENSOR_V_RESMPL_PROJ, LLM_TENSOR_V_RESMPL_QUERY, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d8ff9c0e56227..1bebc7988def4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1248,7 +1248,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_type = llama_model_rope_type(this); // vision model - auto & vparams = clip.hparams; + auto & vparams = vit.hparams; std::string vision_type; ml.get_key(LLM_KV_VISION_TYPE, vision_type, false); if (vision_type == "vit") { @@ -3451,10 +3451,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1, ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft)); } - } // load tensors for vision model - auto & vparams = clip.hparams; + auto & vparams = vit.hparams; if (has_vision) { // language params const int64_t n_embd = hparams.n_embd; @@ -3467,101 +3466,122 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t patch_size = vparams.patch_size; const auto tn = LLM_TN(vparams.arch); - // clip is CPU-only for now - clip.buft = ggml_backend_cpu_buffer_type(); - ggml_context * ctx_vision = ctx_map.at(clip.buft); - clip.layers.resize(n_vlayer); + // TODO: vit is cpu only for now + vit.buft = ggml_backend_cpu_buffer_type(); + ggml_context * ctx_vision = ctx_map.at(vit.buft); + vit.layers.resize(n_vlayer); switch (vparams.arch) { case LLM_ARCH_VISION_LLAVA: case LLM_ARCH_VISION_MOBILEVLM: { if (vparams.arch == LLM_ARCH_VISION_LLAVA) { - clip.mm_1_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff}); - clip.mm_1_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff}); - clip.mm_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff}); - clip.mm_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff}); + vit.mm_1_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff}); + vit.mm_1_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff}); + vit.mm_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff}); + vit.mm_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff}); } else if (vparams.arch == LLM_ARCH_VISION_MOBILEVLM) { - clip.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd}); - clip.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd}); - clip.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd}); - clip.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd}); - clip.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd}); 
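One note on the v.resmpl.attn_{q,k,v} tensors registered in llama-arch above: they do not exist in the original checkpoint; the converter slices the fused resampler.attn.in_proj_(weight|bias) along its first dimension, so each piece is simply a contiguous block of rows. A C++ sketch of the equivalent row-block split (illustrative only; n is resampler_n_embd, cols is 1 for the bias):

struct qkv_views {
    const float * q;
    const float * k;
    const float * v;
};

// fused points at a row-major [3*n, cols] buffer; q/k/v are the three
// consecutive [n, cols] blocks, matching torch.chunk(3, dim=0) in the converter
static qkv_views split_in_proj(const float * fused, int64_t n, int64_t cols) {
    return { fused, fused + n * cols, fused + 2 * n * cols };
}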
- clip.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd}); + vit.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd}); + vit.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd}); + vit.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd}); + vit.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd}); + vit.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd}); + vit.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd}); } - clip.class_embedding = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd}); - clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); - clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + vit.class_embedding = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd}); + vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); + vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); - clip.pre_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd}); - clip.pre_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd}); - clip.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); - clip.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + vit.pre_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd}); + vit.pre_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd}); + vit.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + vit.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); for (int i = 0; i < n_vlayer; ++i) { - auto & layer = clip.layers[i]; - - layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}); - layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}); - layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}); - layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}); - layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}); - layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}); - - layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}); - layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}); - layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}); - layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" 
, i), {n_vembd}); - - layer.norm_in_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}); - layer.norm_in_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}); - layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}); - layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}); - - layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}); - layer.output_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}); + auto & layer = vit.layers[i]; + + layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0); + layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0); + layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0); + layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0); + layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0); + layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0); + + layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0); + layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0); + + layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0); + layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0); + layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0); + layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0); + + layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0); + layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0); } } break; case LLM_ARCH_VISION_MINICPMV: { - clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); - clip.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); - - // TODO: load all resampler tensors + vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); + vit.patch_bias = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd}); + vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + + // resampler + int rs_n_embd = llama_vision_n_mmproj_embd(vit); + vit.mm_model_pos_embed_k = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POS_EMBD_K, "weight"), {rs_n_embd, max_pos_embd}); + vit.mm_model_query = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_QUERY, "weight"), {rs_n_embd, 64}); // why 64? 
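
Note on the hardcoded 64 above: it is the number of learned query vectors of the MiniCPM-V resampler (ne[1] of mm_model_query), matching num_query = 64 used for MiniCPM-V 2.6 in build_minicpmv further down in this patch; 2.5 uses 96, so this dimension would need to come from the checkpoint rather than being hardcoded. A rough NumPy sketch of the resampler shapes, single-head and without biases, for illustration only (the real graph splits hidden_size into heads of 128 and scales by sqrt(128)):

    import numpy as np

    rs_n_embd   = 3584      # llama_vision_n_mmproj_embd for MiniCPM-V 2.6 (4096 for 2.5)
    num_query   = 64        # 96 for MiniCPM-V 2.5, see num_query in build_minicpmv
    num_patches = 1024      # illustrative patch count; the query count is independent of it

    rng   = np.random.default_rng(0)
    query = rng.standard_normal((num_query,   rs_n_embd))   # mm_model_query
    kv    = rng.standard_normal((num_patches, rs_n_embd))   # patch embeddings after kv proj + norm

    attn  = (query @ kv.T) / np.sqrt(rs_n_embd)              # (num_query, num_patches)
    attn  = np.exp(attn - attn.max(axis=-1, keepdims=True))
    attn /= attn.sum(axis=-1, keepdims=True)
    out   = attn @ kv                                         # (num_query, rs_n_embd)

    # however many patches go in, the LLM always receives num_query image embeddings
    assert out.shape == (num_query, rs_n_embd)
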
+ vit.mm_model_proj = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_PROJ, "weight"), {rs_n_embd, rs_n_embd}); + vit.mm_model_kv_proj = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV, "weight"), {n_vembd, rs_n_embd}); + vit.mm_model_attn_q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "weight"), {rs_n_embd, rs_n_embd}); + vit.mm_model_attn_q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "bias" ), {rs_n_embd}); + vit.mm_model_attn_k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_K, "weight"), {rs_n_embd, rs_n_embd}); + vit.mm_model_attn_k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_K, "bias" ), {rs_n_embd}); + vit.mm_model_attn_v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_V, "weight"), {rs_n_embd, rs_n_embd}); + vit.mm_model_attn_v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_V, "bias" ), {rs_n_embd}); + vit.mm_model_attn_o_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "weight"), {rs_n_embd, rs_n_embd}); + vit.mm_model_attn_o_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "bias" ), {rs_n_embd}); + vit.mm_model_ln_q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_Q_NORM, "weight"), {rs_n_embd}); + vit.mm_model_ln_q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_Q_NORM, "bias" ), {rs_n_embd}); + vit.mm_model_ln_kv_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV_NORM, "weight"), {rs_n_embd}); + vit.mm_model_ln_kv_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV_NORM, "bias" ), {rs_n_embd}); + vit.mm_model_ln_post_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "weight"), {rs_n_embd}); + vit.mm_model_ln_post_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "bias" ), {rs_n_embd}); for (int i = 0; i < n_vlayer; ++i) { - auto & layer = clip.layers[i]; - - layer.k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}); - layer.k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}); - layer.v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}); - layer.v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}); - layer.q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}); - layer.q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}); - - layer.ffn_up_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}); - layer.ffn_up_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}); - layer.ffn_down_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}); - layer.ffn_down_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}); - - layer.norm_in_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}); - layer.norm_in_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}); - layer.norm_out_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}); - layer.norm_out_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}); - - layer.output_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}); - layer.output_b = ml.create_tensor(ctx_vision, 
tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}); + auto & layer = vit.layers[i]; + + layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0); + layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0); + layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0); + layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0); + layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0); + layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0); + + layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0); + layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0); + + layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0); + layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0); + layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0); + layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0); + + layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0); + layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0); } } break; default: throw std::runtime_error("unknown vision architecture"); } - if (llama_vision_n_mmproj_embd(clip) != hparams.n_embd) { + if (llama_vision_n_mmproj_embd(vit) != hparams.n_embd) { std::runtime_error("model has vision, but n_mmproj_embd != n_embd"); } } + } ml.done_getting_tensors(); diff --git a/src/llama-model.h b/src/llama-model.h index d7a17d993efaf..1d3f53a71f0a7 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -365,7 +365,7 @@ struct llama_model { // vision bool has_vision = false; - llama_vision_model clip; + llama_vision_model vit; private: struct impl; diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index 3cae05d6b457c..d4471cd2e7917 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -19,8 +19,6 @@ struct img_size; static int bmp_export(const struct llama_image_u8 &img, const std::string &location); #endif -#define VISION_GRAPH_MAX_NODE 1024 - struct img_size { int width; int height; @@ -48,9 +46,9 @@ uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel) { } else if (proj_type == VISION_PROJECTOR_TYPE_LDPV2) { return vmodel.mm_model_peg_0_b->ne[0]; } else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_5) { - return 4096; + return 4096; // resampler } else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_6) { - return 3584; + return 3584; // resampler } else { GGML_ASSERT(false && "invalid proj type"); } @@ -761,16 +759,21 @@ struct llama_vision_graph_builder { return cur; } - // graph for each vision arch - - struct ggml_cgraph * build_llava() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false); + struct ggml_tensor * build_vit() { struct ggml_tensor * cur = build_inp(); cur = build_pre_norm(cur); for (int il = 0; il < n_layers; il++) { cur = build_layer(cur, il); } cur = build_post_norm(cur); + return cur; + } + + // graph for each vision arch + + struct ggml_cgraph * build_llava() { + struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false); + struct ggml_tensor * cur = build_vit(); // llava projector { @@ -825,6 +828,78 @@ struct llama_vision_graph_builder { return gf; } + + struct ggml_cgraph * build_minicpmv() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false); + struct ggml_tensor * cur = build_vit(); + + // minicpmv resampler projector + { + int hidden_size = llama_vision_n_mmproj_embd(*ctx.model); + struct ggml_tensor * q = model.mm_model_query; + // layernorm + { + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + } + + struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, cur); + // layernorm + { + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + } + + // position + struct ggml_tensor * k = ggml_add(ctx0, v, model.mm_model_pos_embed_k); + + // attention + { + const int d_head = 128; + int n_head = hidden_size/d_head; + int num_query = -1; + if (model.hparams.proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_5) { + num_query = 96; + } else if (model.hparams.proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_6) { + num_query = 64; + } + + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); // TODO: do this when converting the model + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); // TODO: do this when converting the model + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // TODO: do this when converting the model + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); // TODO: do this when converting the model + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); + } + // layernorm + { + cur = ggml_norm(ctx0, cur, eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.mm_model_ln_post_w), model.mm_model_ln_post_b); + } + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + } + + ggml_set_name(cur, "output"); + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_vision_tokens & inp) { @@ -852,8 +927,11 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ case LLM_ARCH_VISION_MOBILEVLM: gf = builder.build_llava(); break; + case LLM_ARCH_VISION_MINICPMV: + gf = 
builder.build_minicpmv(); + break; default: - GGML_ASSERT(false && "unsupported arch"); + GGML_ASSERT(false && "unsupported vision arch"); } // alloc memory for graph @@ -903,8 +981,8 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ free(positions_data); } - { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "inp_patches"); + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "inp_patches"); + if (patches) { int* patches_data = (int*)malloc(ggml_nbytes(patches)); for (int i = 0; i < num_patches; i++) { patches_data[i] = i + 1; @@ -962,7 +1040,8 @@ struct llama_vision_tokens * llama_vision_tokenize( case LLM_ARCH_VISION_MOBILEVLM: return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp)); case LLM_ARCH_VISION_MINICPMV: - return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp)); + //return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp)); + return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp)); default: GGML_ASSERT(false && "unsupported arch"); } diff --git a/src/llama-vision.h b/src/llama-vision.h index 374ae45376af7..45cb759448903 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -7,6 +7,8 @@ #include #include +#define VISION_GRAPH_MAX_NODE 2048 + enum vision_projector_type { VISION_PROJECTOR_TYPE_UNKNOWN, VISION_PROJECTOR_TYPE_MLP, @@ -108,24 +110,24 @@ struct llama_vision_model { struct ggml_tensor * mm_model_peg_0_b = nullptr; // MINICPMV projection - struct ggml_tensor * mm_model_pos_embed_k; - struct ggml_tensor * mm_model_query; - struct ggml_tensor * mm_model_proj; - struct ggml_tensor * mm_model_kv_proj; - struct ggml_tensor * mm_model_attn_q_w; - struct ggml_tensor * mm_model_attn_q_b; - struct ggml_tensor * mm_model_attn_k_w; - struct ggml_tensor * mm_model_attn_k_b; - struct ggml_tensor * mm_model_attn_v_w; - struct ggml_tensor * mm_model_attn_v_b; - struct ggml_tensor * mm_model_attn_o_w; - struct ggml_tensor * mm_model_attn_o_b; - struct ggml_tensor * mm_model_ln_q_w; - struct ggml_tensor * mm_model_ln_q_b; - struct ggml_tensor * mm_model_ln_kv_w; - struct ggml_tensor * mm_model_ln_kv_b; - struct ggml_tensor * mm_model_ln_post_w; - struct ggml_tensor * mm_model_ln_post_b; + struct ggml_tensor * mm_model_pos_embed_k = nullptr; + struct ggml_tensor * mm_model_query = nullptr; + struct ggml_tensor * mm_model_proj = nullptr; + struct ggml_tensor * mm_model_kv_proj = nullptr; + struct ggml_tensor * mm_model_attn_q_w = nullptr; + struct ggml_tensor * mm_model_attn_q_b = nullptr; + struct ggml_tensor * mm_model_attn_k_w = nullptr; + struct ggml_tensor * mm_model_attn_k_b = nullptr; + struct ggml_tensor * mm_model_attn_v_w = nullptr; + struct ggml_tensor * mm_model_attn_v_b = nullptr; + struct ggml_tensor * mm_model_attn_o_w = nullptr; + struct ggml_tensor * mm_model_attn_o_b = nullptr; + struct ggml_tensor * mm_model_ln_q_w = nullptr; + struct ggml_tensor * mm_model_ln_q_b = nullptr; + struct ggml_tensor * mm_model_ln_kv_w = nullptr; + struct ggml_tensor * mm_model_ln_kv_b = nullptr; + struct ggml_tensor * mm_model_ln_post_w = nullptr; + struct ggml_tensor * mm_model_ln_post_b = nullptr; struct ggml_tensor * image_newline = nullptr; }; diff --git a/src/llama.cpp b/src/llama.cpp index 6170a655a276a..f6d4e6e8ef378 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9838,9 +9838,9 @@ struct llama_context * llama_init_from_model( } if (model->has_vision) { - ctx->vctx.model = &model->clip; + ctx->vctx.model = &model->vit; 
ctx->vctx.sched = ctx->sched.get(); - const size_t max_nodes = 1024; + const size_t max_nodes = VISION_GRAPH_MAX_NODE; // TODO: make it dynamic ctx->vctx.buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); } From 8586d23c8abdb17e28174fb0ebd12c27adfdadd3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 23 Jan 2025 12:14:06 +0100 Subject: [PATCH 12/25] minicpm working without uhd --- convert_hf_to_gguf.py | 25 +++++++++++++++++++++++++ examples/vision/vision.cpp | 2 +- gguf-py/gguf/constants.py | 12 ++++++++++++ gguf-py/gguf/tensor_mapping.py | 16 ++++++++++++++++ src/llama-arch.cpp | 4 ++++ src/llama-arch.h | 4 ++++ src/llama-model.cpp | 6 ++++++ src/llama-vision.cpp | 4 ++++ src/llama-vision.h | 6 +++++- 9 files changed, 77 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9a05e99602575..e703cd33dfcb8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2339,6 +2339,7 @@ class MiniCPMVModel(Qwen2Model): model_arch = gguf.MODEL_ARCH.QWEN2 proj_type: gguf.constants.CLIPProjectorType | None resampler_n_embd = 0 + tok_embd_tensor: Tensor | None = None def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2361,6 +2362,8 @@ def __init__(self, *args, **kwargs): for tname, tensor in self.get_tensors(): if tname == "resampler.ln_post.bias": self.resampler_n_embd = tensor.shape[0] + if tname.endswith("embed_tokens.weight"): + self.tok_embd_tensor = tensor if self.resampler_n_embd < 2: raise ValueError("Failed to detect resampler embedding size") else: @@ -2372,6 +2375,16 @@ def __init__(self, *args, **kwargs): self.hparams["vision_feature_layer"] = 0 self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) + def get_embd_of_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, str]]) -> Iterable[tuple[str, Tensor]]: + if self.tok_embd_tensor is None: + raise ValueError("Token embedding tensor not found") + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + for token, tensor_name in map_token_to_tensor_name: + tok_id = tokenizer.get_vocab()[token] + row = self.tok_embd_tensor[tok_id] + yield tensor_name, row + def set_gguf_parameters(self): super().set_gguf_parameters() # For vision model @@ -2388,6 +2401,14 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True), torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70))) ) + added_tokens = [ + ("", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE ] + ".weight"), + ("", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE] + ".weight"), + ("", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE ] + ".weight"), + ("", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE] + ".weight"), + ] + for tensor_name, tensor in self.get_embd_of_tokens(added_tokens): + yield tensor_name, tensor def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -2404,6 +2425,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name_k = name.replace("in_proj_", "in_proj_k.") # in_proj_k.(weight|bias) name_v = name.replace("in_proj_", "in_proj_v.") # in_proj_v.(weight|bias) return [ + # TODO: permute these (self.map_tensor_name(name_q), split_tensor[0]), (self.map_tensor_name(name_k), 
split_tensor[1]), (self.map_tensor_name(name_v), split_tensor[2]), @@ -2413,6 +2435,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "resampler.proj" or name == "resampler.query": name += ".weight" + if name.startswith("resampler.proj"): + data_torch = data_torch.transpose(-1, -2).contiguous() + if "post_layernorm" in name: return [] # skip post_layernorm diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp index d994535f6e66f..bb2cdbf4ea657 100644 --- a/examples/vision/vision.cpp +++ b/examples/vision/vision.cpp @@ -100,7 +100,7 @@ int main(int argc, char ** argv) { // default prompt for llava 1.5 //params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:\nwhat did you see?\nASSISTANT:"; // default prompt for minicpmv 2.6 - params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nwhat did you see?\n<|im_end|>\n<|im_start|>assistant\n"; + params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nwhat did you see?\n<|im_end|>\n<|im_start|>assistant\n"; params.n_predict = 64; params.n_batch = 2048; params.n_ubatch = 1024; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6cc9609fcf8ca..f4da3e234abdb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -467,6 +467,10 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_Q_NORM = auto() # minicpmv V_RESMPL_PROJ = auto() # minicpmv V_RESMPL_QUERY = auto() # minicpmv + V_TOK_EMBD_IMAGE = auto() # embedding for token + V_TOK_EMBD_END_IMAGE = auto() # embedding for token + V_TOK_EMBD_SLICE = auto() # embedding for token + V_TOK_EMBD_END_SLICE = auto() # embedding for token MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -686,6 +690,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_Q_NORM: "v.resmpl.q_norm", MODEL_TENSOR.V_RESMPL_PROJ: "v.resmpl.proj", MODEL_TENSOR.V_RESMPL_QUERY: "v.resmpl.query", + MODEL_TENSOR.V_TOK_EMBD_IMAGE: "v.tok_embd.image", + MODEL_TENSOR.V_TOK_EMBD_END_IMAGE: "v.tok_embd.end_image", + MODEL_TENSOR.V_TOK_EMBD_SLICE: "v.tok_embd.slice", + MODEL_TENSOR.V_TOK_EMBD_END_SLICE: "v.tok_embd.end_slice", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1682,6 +1690,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_Q_NORM, MODEL_TENSOR.V_RESMPL_PROJ, MODEL_TENSOR.V_RESMPL_QUERY, + MODEL_TENSOR.V_TOK_EMBD_IMAGE, + MODEL_TENSOR.V_TOK_EMBD_END_IMAGE, + MODEL_TENSOR.V_TOK_EMBD_SLICE, + MODEL_TENSOR.V_TOK_EMBD_END_SLICE, ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b756ec184136e..0228e84000b11 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -907,6 +907,22 @@ class TensorNameMap: MODEL_TENSOR.V_RESMPL_QUERY: ( "resampler.query", ), + + MODEL_TENSOR.V_TOK_EMBD_IMAGE:( + "v.tok_embd.image", # tensor generated from token embeddings + ), + + MODEL_TENSOR.V_TOK_EMBD_END_IMAGE:( + "v.tok_embd.end_image", # tensor generated from token embeddings + ), + + MODEL_TENSOR.V_TOK_EMBD_SLICE:( + "v.tok_embd.slice", # tensor generated from token embeddings + ), + + MODEL_TENSOR.V_TOK_EMBD_END_SLICE:( + "v.tok_embd.end_slice", # tensor generated from token embeddings + ), } # architecture-specific block mappings diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 0b20b03f6a0de..1a6d4533156a8 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp 
@@ -1382,6 +1382,10 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_V_RESMPL_Q_NORM, "v.resmpl.q_norm" }, { LLM_TENSOR_V_RESMPL_PROJ, "v.resmpl.proj" }, { LLM_TENSOR_V_RESMPL_QUERY, "v.resmpl.query" }, + { LLM_TENSOR_V_TOK_EMBD_IMAGE, "v.tok_embd.image" }, + { LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "v.tok_embd.end_image" }, + { LLM_TENSOR_V_TOK_EMBD_SLICE, "v.tok_embd.slice" }, + { LLM_TENSOR_V_TOK_EMBD_END_SLICE, "v.tok_embd.end_slice" }, } }, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 4f3e76a5f20c6..3440ded53f2b5 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -381,6 +381,10 @@ enum llm_tensor { LLM_TENSOR_V_RESMPL_Q_NORM, LLM_TENSOR_V_RESMPL_PROJ, LLM_TENSOR_V_RESMPL_QUERY, + LLM_TENSOR_V_TOK_EMBD_IMAGE, + LLM_TENSOR_V_TOK_EMBD_END_IMAGE, + LLM_TENSOR_V_TOK_EMBD_SLICE, + LLM_TENSOR_V_TOK_EMBD_END_SLICE, }; enum llm_tensor_layer { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1bebc7988def4..4aed37d89c2e1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3549,6 +3549,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { vit.mm_model_ln_post_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "weight"), {rs_n_embd}); vit.mm_model_ln_post_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "bias" ), {rs_n_embd}); + // tok embd + vit.mm_tok_embd_image = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_IMAGE, "weight"), {n_embd}); + vit.mm_tok_embd_end_image = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "weight"), {n_embd}); + vit.mm_tok_embd_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_SLICE, "weight"), {n_embd}); + vit.mm_tok_embd_end_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd}); + for (int i = 0; i < n_vlayer; ++i) { auto & layer = vit.layers[i]; diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index d4471cd2e7917..ca65d536bc51e 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -895,6 +895,10 @@ struct llama_vision_graph_builder { cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); } + // add and token embeddings + cur = ggml_concat(ctx0, model.mm_tok_embd_image, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_tok_embd_end_image, 1); + ggml_set_name(cur, "output"); ggml_build_forward_expand(gf, cur); diff --git a/src/llama-vision.h b/src/llama-vision.h index 45cb759448903..948c8d0ed1e03 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -129,7 +129,11 @@ struct llama_vision_model { struct ggml_tensor * mm_model_ln_post_w = nullptr; struct ggml_tensor * mm_model_ln_post_b = nullptr; - struct ggml_tensor * image_newline = nullptr; + // special tokens + struct ggml_tensor * mm_tok_embd_image = nullptr; + struct ggml_tensor * mm_tok_embd_end_image = nullptr; + struct ggml_tensor * mm_tok_embd_slice = nullptr; + struct ggml_tensor * mm_tok_embd_end_slice = nullptr; }; struct llama_vision_context { From 25a97ce4cbc69f8fc1ee4911218e569298d1ad6a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 23 Jan 2025 13:34:13 +0100 Subject: [PATCH 13/25] correct positions for siglip --- examples/vision/vision.cpp | 2 +- src/llama-vision.cpp | 58 ++++++++++++++++++++++++++------------ 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp index bb2cdbf4ea657..d97067bba616f 100644 --- a/examples/vision/vision.cpp +++ b/examples/vision/vision.cpp @@ -100,7 +100,7 @@ int main(int argc, char ** argv) { // default prompt for llava 1.5 
//params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:\nwhat did you see?\nASSISTANT:"; // default prompt for minicpmv 2.6 - params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nwhat did you see?\n<|im_end|>\n<|im_start|>assistant\n"; + params.prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nwhat do you see?<|im_end|>\n<|im_start|>assistant\n"; params.n_predict = 64; params.n_batch = 2048; params.n_ubatch = 1024; diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index ca65d536bc51e..209d0b137dc1d 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -501,15 +501,14 @@ struct llama_vision_processor_uhd : llama_vision_processor { llama_image_u8 source_image; bicubic_resize(img, source_image, best_size.first, best_size.second); // source_image = image.resize(best_size, Image.Resampling.BICUBIC) - images[images.size()-1].push_back(source_image); - } - else if (multiple > 1) { + images.back().push_back(source_image); + } else if (multiple > 1) { auto best_size = find_best_resize(original_size, scale_resolution, patch_size); llama_image_u8 source_image; bicubic_resize(img, source_image, best_size.first, best_size.second); // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) LLAMA_LOG_DEBUG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img.nx, img.ny, best_size.first, best_size.second); - images[images.size()-1].push_back(source_image); + images.back().push_back(source_image); std::pair best_grid = find_best_grid(max_slice_nums, multiple, log_ratio); LLAMA_LOG_DEBUG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img.nx, img.ny, best_grid.first, best_grid.second); @@ -541,7 +540,7 @@ struct llama_vision_processor_uhd : llama_vision_processor { patch.buf[j+2] = refine_image.buf[i+2]; } } - images[images.size()-1].push_back(patch); + images.back().push_back(patch); } } } @@ -948,7 +947,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ // set raw input { struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - float * data = (float *)malloc(ggml_nbytes(inp_raw)); + std::vector inp_buf(ggml_nelements(inp_raw)); for (int i = 0; i < batch_size; i++) { const int nx = inp.px * inp.n_px; @@ -959,14 +958,13 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ for (int k = 0; k < 3; k++) { for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = inp.buf[b][3 * (y * nx + x) + k]; + inp_buf[(b * 3 * n) + k * n + y * nx + x] = inp.buf[b][3 * (y * nx + x) + k]; } } } } } - ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); - free(data); + ggml_backend_tensor_set(inp_raw, inp_buf.data(), 0, ggml_nbytes(inp_raw)); } if (model.class_embedding) { @@ -974,33 +972,57 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ ggml_set_zero(inp_embd); } - { + if (hparams.arch == LLM_ARCH_VISION_MINICPMV) { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "inp_pos"); + std::vector 
pos_buf(ggml_nelements(positions)); + GGML_ASSERT(num_positions == (int)pos_buf.size()); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); + int bucket_coords_h[70]; + int bucket_coords_w[70]; + for (size_t i = 0; i < inp.n_py; i++) { + bucket_coords_h[i] = std::floor(70.0*i/inp.n_py); + } + for (size_t i = 0; i < inp.n_px; i++) { + bucket_coords_w[i] = std::floor(70.0*i/inp.n_px); + } + for (size_t i = 0, id = 0; i < inp.n_py; i++){ + for (size_t j = 0; j < inp.n_px; j++){ + pos_buf[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + ggml_backend_tensor_set(positions, pos_buf.data(), 0, ggml_nbytes(positions)); + + } else { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "inp_pos"); + std::vector pos_buf(ggml_nelements(positions)); + GGML_ASSERT(num_positions == (int)pos_buf.size()); for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; + pos_buf[i] = i; } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); + ggml_backend_tensor_set(positions, pos_buf.data(), 0, ggml_nbytes(positions)); } struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "inp_patches"); if (patches) { - int* patches_data = (int*)malloc(ggml_nbytes(patches)); + std::vector patches_buf(ggml_nelements(patches)); + GGML_ASSERT(num_patches == (int)patches_buf.size()); for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; + patches_buf[i] = i + 1; } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); + ggml_backend_tensor_set(patches, patches_buf.data(), 0, ggml_nbytes(patches)); } // compute + int64_t t_start = ggml_time_ms(); ggml_backend_sched_graph_compute(ctx.sched, gf); // the last node is the embedding tensor struct ggml_tensor * output_node = ggml_graph_node(gf, -1); //LLAMA_LOG_INFO("%s: output tensor shape = %lld %lld %lld %lld\n", __func__, output->ne[0], output->ne[1], output->ne[2], output->ne[3]); + LLAMA_LOG_DEBUG("%s: compute time = %lld ms\n", __func__, ggml_time_ms() - t_start); // copy output node to context if (ctx.ctx_ggml) { From c3a654c0fbad4c7eeeaf669fc708d40aef6f341c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 23 Jan 2025 15:51:30 +0100 Subject: [PATCH 14/25] add SmolVLM --- convert_hf_to_gguf.py | 37 ++++++++++++++++++++++-------- gguf-py/gguf/constants.py | 19 +++++++++++++++ gguf-py/gguf/gguf_writer.py | 3 +++ gguf-py/gguf/tensor_mapping.py | 15 ++++++++++++ src/llama-arch.cpp | 21 +++++++++++++++++ src/llama-arch.h | 3 +++ src/llama-model.cpp | 38 ++++++++++++++++++++++++++++++ src/llama-vision.cpp | 42 +++++++++++++++++++++++++++++++++- src/llama-vision.h | 3 +++ 9 files changed, 171 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e703cd33dfcb8..27bf2c1f213ac 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -292,7 +292,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"]) self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"]) self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"]) - self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"])) + try: + self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"])) + except KeyError: + self.gguf_writer.add_vision_vit_select_layer(0) self.gguf_writer.add_file_type(self.ftype) 
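
For reference, the position handling added in the previous patch ("correct positions for siglip") no longer numbers MiniCPM-V patches 0..n-1; each patch of the n_px x n_py grid is bucketed into a fixed 70x70 coordinate table (the same 70x70 grid used for the resampler's sin-cos position embeddings generated in convert_hf_to_gguf.py). A standalone sketch of that bucketing, mirroring the loops above:

    import math

    def siglip_bucket_positions(n_px: int, n_py: int, n_buckets: int = 70) -> list[int]:
        # each patch (i, j) of an n_py x n_px grid gets an index into an
        # n_buckets x n_buckets position-embedding table
        coords_h = [math.floor(n_buckets * i / n_py) for i in range(n_py)]
        coords_w = [math.floor(n_buckets * j / n_px) for j in range(n_px)]
        return [h * n_buckets + w for h in coords_h for w in coords_w]

    pos = siglip_bucket_positions(n_px=32, n_py=32)  # e.g. a 448x448 image with 14-px patches
    assert len(pos) == 32 * 32 and max(pos) < 70 * 70
    print(pos[:8])  # [0, 2, 4, 6, 8, 10, 13, 15]
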
logger.info(f"gguf: file type = {self.ftype}") @@ -506,8 +509,9 @@ def load_hparams(dir_model: Path): hparams = json.load(f) if "text_config" in hparams: text_config = hparams["text_config"] + model_id = text_config.get("_name_or_path", None) # for example, llava-1.5-7b-hf misses the language model config, need to retrieve it via model ID - if "_name_or_path" in text_config: + if model_id is not None and model_id != "None" and model_id != "": text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict() hparams = {**text_config, **hparams} return hparams @@ -1616,7 +1620,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM", "Idefics3ForConditionalGeneration") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA @@ -1640,6 +1644,11 @@ def __init__(self, *args, **kwargs): self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict() self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM + if "vision_config" in self.hparams and model_type == "idefics3": + self.vparams = self.hparams["vision_config"] + self.preprocessor_config = self.load_preprocessor_config(self.dir_model) + self.vision_arch = gguf.MODEL_ARCH.VISION_IDEFICS3 + if self.vparams is not None and self.vision_arch is not None: self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) @@ -1694,14 +1703,20 @@ def set_gguf_parameters(self): # For vision model if self.vparams is not None: + max_pos_embd = -1 self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) # TODO: should not hardcode these, but they are currently missing from config.json if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA: self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP) + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM: self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2) + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 + if self.vision_arch == gguf.MODEL_ARCH.VISION_IDEFICS3: + self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP) + self.gguf_writer.add_vision_vit_scale_factor(self.hparams["scale_factor"]) + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05) - max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd) @staticmethod @@ -1717,19 +1732,23 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + is_vision_tensor = "vision_tower" in name or "vision_model" in name # For vision model if name.startswith("language_model"): name = name.replace("language_model.", "") + if name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM else: name = 
name.replace("model.vision_tower.", "") - if "post_layernorm" in name: + if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3: return [] # skip post_layernorm - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if not is_vision_tensor: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) # process the experts separately if name.find("block_sparse_moe.experts") != -1: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f4da3e234abdb..cc11aa56d93e9 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -238,6 +238,7 @@ class Vit: PATCH_MERGE_TYPE = "vision.vit.patch_merge_type" HEAD_COUNT = "vision.vit.attention.head_count" LAYERNORM_EPS = "vision.vit.attention.layer_norm_epsilon" + SCALE_FACTOR = "vision.vit.scale_factor" # only used by idefics3 for now # # recommended mapping of model tensor names for storage in gguf @@ -311,6 +312,7 @@ class MODEL_ARCH(IntEnum): VISION_LLAVA = auto() VISION_MOBILEVLM = auto() VISION_MINICPMV = auto() + VISION_IDEFICS3 = auto() class MODEL_TENSOR(IntEnum): @@ -441,6 +443,7 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_OUT = auto() # vision V_MMPROJ = auto() + V_MMPROJ_FC = auto() V_MMPROJ_MLP = auto() V_MMPROJ_PEG = auto() V_ENC_EMBD_CLS = auto() @@ -535,6 +538,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.VISION_LLAVA: "llava", MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm", MODEL_ARCH.VISION_MINICPMV: "minicpmv", + MODEL_ARCH.VISION_IDEFICS3: "idefics3", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -664,6 +668,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", # vision MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}", + MODEL_TENSOR.V_MMPROJ_FC: "v.mmproj.fc", MODEL_TENSOR.V_MMPROJ_MLP: "v.mmproj.mlp.{bid}", MODEL_TENSOR.V_MMPROJ_PEG: "v.mmproj.peg.{bid}", MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls", @@ -1695,6 +1700,20 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_TOK_EMBD_SLICE, MODEL_TENSOR.V_TOK_EMBD_END_SLICE, ], + MODEL_ARCH.VISION_IDEFICS3: [ + MODEL_TENSOR.V_MMPROJ_FC, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_POST_NORM, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 65d0e8f3004ab..a31ab736bc20a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -928,6 +928,9 @@ def add_vision_vit_image_mean(self, value: Sequence[float]) -> None: def add_vision_vit_image_std(self, value: Sequence[float]) -> None: self.add_array(Keys.Vision.IMAGE_STD, value) + def add_vision_vit_scale_factor(self, value: int) -> None: + self.add_int32(Keys.Vision.Vit.SCALE_FACTOR, value) + def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: if not isinstance(value, str): template_default = None diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0228e84000b11..3f247d787ba11 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ 
b/gguf-py/gguf/tensor_mapping.py @@ -794,6 +794,10 @@ class TensorNameMap: "multi_modal_projector.linear_{bid}", ), + MODEL_TENSOR.V_MMPROJ_FC: ( + "model.connector.modality_projection.proj", # SmolVLM + ), + MODEL_TENSOR.V_MMPROJ_MLP: ( "model.mm_projector.mlp.mlp.{bid}", ), @@ -809,51 +813,61 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_PATCH: ( "vision_tower.vision_model.embeddings.patch_embedding", "vpm.embeddings.patch_embedding", + "model.vision_model.embeddings.patch_embedding", # SmolVLM ), MODEL_TENSOR.V_ENC_EMBD_POS: ( "vision_tower.vision_model.embeddings.position_embedding", "vpm.embeddings.position_embedding", + "model.vision_model.embeddings.position_embedding", # SmolVLM ), MODEL_TENSOR.V_ENC_ATTN_Q: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", "vpm.encoder.layers.{bid}.self_attn.q_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM ), MODEL_TENSOR.V_ENC_ATTN_K: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", "vpm.encoder.layers.{bid}.self_attn.k_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM ), MODEL_TENSOR.V_ENC_ATTN_V: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", "vpm.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", "vpm.encoder.layers.{bid}.layer_norm1", + "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM ), MODEL_TENSOR.V_ENC_OUTPUT: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", "vpm.encoder.layers.{bid}.self_attn.out_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM ), MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", "vpm.encoder.layers.{bid}.layer_norm2", + "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM ), MODEL_TENSOR.V_ENC_FFN_UP: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "vpm.encoder.layers.{bid}.mlp.fc1", + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "vpm.encoder.layers.{bid}.mlp.fc2", + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM ), MODEL_TENSOR.V_PRE_NORM: ( @@ -862,6 +876,7 @@ class TensorNameMap: MODEL_TENSOR.V_POST_NORM: ( "vision_tower.vision_model.post_layernorm", + "model.vision_model.post_layernorm", # SmolVLM ), MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 1a6d4533156a8..92e488f57a59d 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -66,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_VISION_LLAVA, "llava" }, { LLM_ARCH_VISION_MOBILEVLM, "mobilevlm" }, { LLM_ARCH_VISION_MINICPMV, "minicpmv" }, + { LLM_ARCH_VISION_IDEFICS3, "idefics3" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -214,6 +215,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" }, { LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" }, { LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" }, + { LLM_KV_VISION_VIT_SCALE_FACTOR, "vision.vit.scale_factor" }, // deprecated { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, @@ -1388,6 +1390,25 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_V_TOK_EMBD_END_SLICE, "v.tok_embd.end_slice" }, } }, + { + 
LLM_ARCH_VISION_IDEFICS3, + { + { LLM_TENSOR_V_MMPROJ_FC, "v.mmproj.fc" }, + { LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" }, + { LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" }, + { LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" }, + { LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" }, + { LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" }, + { LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" }, + { LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" }, + { LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" }, + { LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" }, + { LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" }, + { LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" }, + { LLM_TENSOR_V_PRE_NORM, "v.pre_norm" }, + { LLM_TENSOR_V_POST_NORM, "v.post_norm" }, + } + }, { LLM_ARCH_UNKNOWN, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 3440ded53f2b5..c3fc3203201d0 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -70,6 +70,7 @@ enum llm_arch { LLM_ARCH_VISION_LLAVA, LLM_ARCH_VISION_MOBILEVLM, LLM_ARCH_VISION_MINICPMV, + LLM_ARCH_VISION_IDEFICS3, LLM_ARCH_UNKNOWN, }; @@ -218,6 +219,7 @@ enum llm_kv { LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, LLM_KV_VISION_VIT_HEAD_COUNT, LLM_KV_VISION_VIT_LAYERNORM_EPS, + LLM_KV_VISION_VIT_SCALE_FACTOR, // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, @@ -354,6 +356,7 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_OUT, // vision LLM_TENSOR_V_MMPROJ, + LLM_TENSOR_V_MMPROJ_FC, LLM_TENSOR_V_MMPROJ_MLP, LLM_TENSOR_V_MMPROJ_PEG, LLM_TENSOR_V_ENC_EMBD_CLS, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4aed37d89c2e1..6a6b656181477 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1265,6 +1265,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true); ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true); ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true); + ml.get_key(LLM_KV_VISION_VIT_SCALE_FACTOR, vparams.scale_factor, false); { std::string name; ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true); @@ -3555,6 +3556,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) { vit.mm_tok_embd_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_SLICE, "weight"), {n_embd}); vit.mm_tok_embd_end_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd}); + for (int i = 0; i < n_vlayer; ++i) { + auto & layer = vit.layers[i]; + + layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0); + layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0); + layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0); + layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0); + layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0); + layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0); + + layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0); + layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0); + + layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0); + layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, 
"bias" , i), {n_vembd}, 0); + layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0); + layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0); + + layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0); + layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0); + } + } break; + case LLM_ARCH_VISION_IDEFICS3: + { + int scale_factor = vit.hparams.scale_factor; + vit.projection = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_FC, "weight"), {n_vembd * scale_factor * scale_factor, n_embd}); + + vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); + vit.patch_bias = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd}); + vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + + vit.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}); + vit.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}); + for (int i = 0; i < n_vlayer; ++i) { auto & layer = vit.layers[i]; @@ -4085,6 +4122,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) { case LLM_ARCH_VISION_LLAVA: case LLM_ARCH_VISION_MOBILEVLM: case LLM_ARCH_VISION_MINICPMV: + case LLM_ARCH_VISION_IDEFICS3: GGML_ABORT("vision arch does not use RoPE"); // all model arches should be listed explicitly here diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index 209d0b137dc1d..2da15e2ebc95f 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -42,7 +42,9 @@ struct llama_image_u8 { uint32_t llama_vision_n_mmproj_embd(const llama_vision_model & vmodel) { auto & proj_type = vmodel.hparams.proj_type; if (proj_type == VISION_PROJECTOR_TYPE_MLP) { - return vmodel.mm_2_b->ne[0]; + return vmodel.mm_2_b + ? 
vmodel.mm_2_b->ne[0] + : vmodel.projection->ne[1]; // idefics3 } else if (proj_type == VISION_PROJECTOR_TYPE_LDPV2) { return vmodel.mm_model_peg_0_b->ne[0]; } else if (proj_type == VISION_PROJECTOR_TYPE_MINICPMV_2_5) { @@ -903,6 +905,40 @@ struct llama_vision_graph_builder { return gf; } + + struct ggml_cgraph * build_idefics3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, VISION_GRAPH_MAX_NODE, false); + struct ggml_tensor * cur = build_vit(); + + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + { + const int scale_factor = model.hparams.scale_factor; + const int n_embd = cur->ne[0]; + const int seq = cur->ne[1]; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = std::sqrt(seq); + const int width = std::sqrt(seq); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + seq / (scale_factor * scale_factor), + bsz); + + cur = ggml_mul_mat(ctx0, model.projection, cur); + } + + ggml_set_name(cur, "output"); + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_vision_tokens & inp) { @@ -933,6 +969,9 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ case LLM_ARCH_VISION_MINICPMV: gf = builder.build_minicpmv(); break; + case LLM_ARCH_VISION_IDEFICS3: + gf = builder.build_idefics3(); + break; default: GGML_ASSERT(false && "unsupported vision arch"); } @@ -1064,6 +1103,7 @@ struct llama_vision_tokens * llama_vision_tokenize( switch (vctx.model->hparams.arch) { case LLM_ARCH_VISION_LLAVA: case LLM_ARCH_VISION_MOBILEVLM: + case LLM_ARCH_VISION_IDEFICS3: return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp)); case LLM_ARCH_VISION_MINICPMV: //return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp)); diff --git a/src/llama-vision.h b/src/llama-vision.h index 948c8d0ed1e03..953ec57953079 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -48,6 +48,9 @@ struct llama_vision_model { std::array image_grid_pinpoints; // TODO: should this be array of (x, y) pairs? 
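
The build_idefics3 projector added above mirrors the pixel-shuffle step of the linked modeling_idefics3.py: every scale_factor x scale_factor block of neighbouring patches is folded into the channel dimension before the single mmproj.fc matmul, so the sequence length shrinks by scale_factor^2 while the embedding width grows by the same factor. A NumPy sketch with illustrative sizes (not tied to any particular checkpoint):

    import math
    import numpy as np

    def pixel_shuffle(x: np.ndarray, scale_factor: int) -> np.ndarray:
        # x: (seq, n_embd) ViT output, seq assumed to be a square patch grid
        seq, n_embd = x.shape
        h = w = math.isqrt(seq)
        x = x.reshape(h, w // scale_factor, n_embd * scale_factor)
        x = x.transpose(1, 0, 2)
        x = x.reshape(w // scale_factor, h // scale_factor, n_embd * scale_factor**2)
        x = x.transpose(1, 0, 2)
        return x.reshape(seq // scale_factor**2, n_embd * scale_factor**2)

    vit_out = np.zeros((1024, 1152), dtype=np.float32)  # 32x32 patches, illustrative width
    merged  = pixel_shuffle(vit_out, scale_factor=2)
    assert merged.shape == (1024 // 4, 1152 * 4)        # 256 tokens fed to v.mmproj.fc
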
int32_t image_crop_resolution; + + // idefics3 + int scale_factor = 0; }; struct vision_hparams hparams; ggml_backend_buffer_type_t buft; From b986af80de653795ab5facee28079ff5a0b1f018 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 23 Jan 2025 23:07:08 +0100 Subject: [PATCH 15/25] py: a bit cleaner --- convert_hf_to_gguf.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 27bf2c1f213ac..b3c2ce2b1c7df 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1734,17 +1734,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_kv_head = self.hparams.get("num_key_value_heads") is_vision_tensor = "vision_tower" in name or "vision_model" in name - # For vision model - if name.startswith("language_model"): - name = name.replace("language_model.", "") - if name.startswith("model.text_model"): - name = name.replace("text_model.", "") # for SmolVLM - else: - name = name.replace("model.vision_tower.", "") - if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3: - return [] # skip post_layernorm + if is_vision_tensor: + if name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM + else: + name = name.replace("model.vision_tower.", "") + if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3: + return [] # skip post_layernorm if not is_vision_tensor: + if name.startswith("language_model"): + # language model tensors, remove the prefix + name = name.replace("language_model.", "") if name.endswith(("q_proj.weight", "q_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): From 90eefc2ba4cea865e2684d78653303ccdd292e48 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 25 Jan 2025 15:52:54 +0100 Subject: [PATCH 16/25] refactor minicpm-v support --- convert_hf_to_gguf.py | 78 +++++++++++++++++----------- src/llama-arch.cpp | 23 +++++++-- src/llama-arch.h | 1 + src/llama-model.cpp | 117 +++++++++++++++++++++++------------------- src/llama-vision.cpp | 103 ++++++++++++++++++++----------------- 5 files changed, 186 insertions(+), 136 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b3c2ce2b1c7df..d7eab4c469c63 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1008,6 +1008,29 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) +# TODO: maybe merge this with Model in the future +class VisionModelHelper: + model: Model + tok_embd_tensor: Tensor | None = None + + def __init__(self, model: Model): + self.model = model + # TODO: how to do this without reading the whole safetensor file? 
+ for tname, tensor in model.get_tensors(): + if tname.endswith("embed_tokens.weight"): + self.tok_embd_tensor = tensor + + def get_embd_for_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, gguf.MODEL_TENSOR]], tensor_name_postfix = '.weight') -> Iterable[tuple[str, Tensor]]: + if self.tok_embd_tensor is None: + raise ValueError("Token embedding tensor not found") + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.model.dir_model, trust_remote_code=True) + for token, tensor_name in map_token_to_tensor_name: + tok_id = tokenizer.get_vocab()[token] + row = self.tok_embd_tensor[tok_id] + yield gguf.TENSOR_NAMES[tensor_name] + tensor_name_postfix, row + + @Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): model_arch = gguf.MODEL_ARCH.GPTNEOX @@ -2355,11 +2378,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: @Model.register("MiniCPMV") class MiniCPMVModel(Qwen2Model): - # based on minicpmv-surgery.py, not sure why it is Qwen2Model instead of MiniCPMModel + # MiniCPM-V 2.5 is Qwen2 and 2.6 is Qwen-2.5 model_arch = gguf.MODEL_ARCH.QWEN2 proj_type: gguf.constants.CLIPProjectorType | None resampler_n_embd = 0 - tok_embd_tensor: Tensor | None = None + vhelper: VisionModelHelper | None def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2378,56 +2401,49 @@ def __init__(self, *args, **kwargs): self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6 else: raise ValueError(f"Unsupported MiniCPM-V version: {version}") + self.vhelper = VisionModelHelper(self) # TODO: how to do this without reading the whole safetensor file? for tname, tensor in self.get_tensors(): if tname == "resampler.ln_post.bias": self.resampler_n_embd = tensor.shape[0] - if tname.endswith("embed_tokens.weight"): - self.tok_embd_tensor = tensor if self.resampler_n_embd < 2: raise ValueError("Failed to detect resampler embedding size") else: raise ValueError("Expected vision_config, but not found") - if self.vparams is not None and self.vision_arch is not None and self.preprocessor_config is not None: - self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5] - self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5] - self.hparams["vision_feature_layer"] = 0 - self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) - - def get_embd_of_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, str]]) -> Iterable[tuple[str, Tensor]]: - if self.tok_embd_tensor is None: - raise ValueError("Token embedding tensor not found") - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - for token, tensor_name in map_token_to_tensor_name: - tok_id = tokenizer.get_vocab()[token] - row = self.tok_embd_tensor[tok_id] - yield tensor_name, row + assert self.vparams is not None + assert self.vision_arch is not None + assert self.preprocessor_config is not None + self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5] + self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5] + self.hparams["vision_feature_layer"] = 0 + self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) def set_gguf_parameters(self): super().set_gguf_parameters() - # For vision model - if self.vparams is not None and self.proj_type is not None: - self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) - self.gguf_writer.add_vision_vit_projector_type(self.proj_type) - 
self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06) - max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 - self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd) + assert self.vparams is not None and self.proj_type is not None + self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) + self.gguf_writer.add_vision_vit_projector_type(self.proj_type) + self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06) + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # because the model operates exclusively on 70x70 patches for now, we should precompute the positional embeddings to gain performance + # in the future, we can do it in cpp if we figure out how to do it efficiently yield ( self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True), torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70))) ) + assert self.vhelper is not None added_tokens = [ - ("", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE ] + ".weight"), - ("", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE] + ".weight"), - ("", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE ] + ".weight"), - ("", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE] + ".weight"), + ("", gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE), + ("", gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE), + ("", gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE), + ("", gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE), ] - for tensor_name, tensor in self.get_embd_of_tokens(added_tokens): + for tensor_name, tensor in self.vhelper.get_embd_for_tokens(added_tokens): yield tensor_name, tensor def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 92e488f57a59d..0da19fe67d340 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1559,9 +1559,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // vision - {LLM_TENSOR_V_MMPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_V_MMPROJ_MLP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_V_MMPROJ_PEG, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_MMPROJ, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_MMPROJ_MLP, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_MMPROJ_PEG, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, {LLM_TENSOR_V_ENC_EMBD_CLS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}}, {LLM_TENSOR_V_ENC_EMBD_PATCH, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}}, {LLM_TENSOR_V_ENC_EMBD_POS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}}, @@ -1575,7 +1575,22 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_V_ENC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_V_PRE_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_V_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - // TODO: add minicpmv resampler tensors + {LLM_TENSOR_V_RESMPL_POS_EMBD_K, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_ADD}}, + {LLM_TENSOR_V_RESMPL_ATTN_Q, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_RESMPL_ATTN_K, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_RESMPL_ATTN_V, {LLM_TENSOR_LAYER_PROJECTION, 
GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_RESMPL_ATTN_OUT, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_RESMPL_KV, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_RESMPL_KV_NORM, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}}, + {LLM_TENSOR_V_RESMPL_POST_NORM, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}}, + {LLM_TENSOR_V_RESMPL_Q_NORM, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}}, + {LLM_TENSOR_V_RESMPL_PROJ, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_V_RESMPL_QUERY, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, + // special token embeddings for image + {LLM_TENSOR_V_TOK_EMBD_IMAGE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}}, + {LLM_TENSOR_V_TOK_EMBD_END_IMAGE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}}, + {LLM_TENSOR_V_TOK_EMBD_SLICE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}}, + {LLM_TENSOR_V_TOK_EMBD_END_SLICE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index c3fc3203201d0..a84e17b570076 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -393,6 +393,7 @@ enum llm_tensor { enum llm_tensor_layer { LLM_TENSOR_LAYER_INPUT, LLM_TENSOR_LAYER_REPEATING, + LLM_TENSOR_LAYER_PROJECTION, LLM_TENSOR_LAYER_OUTPUT, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6a6b656181477..d5cd1eb048d57 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -217,6 +217,11 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); } break; + case GGML_OP_CONCAT: + { + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); + op_tensor = ggml_concat(ctx, w, b, 0); + } break; default: GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); } @@ -1469,7 +1474,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } // sanity checks - if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) { + if (info.layer == LLM_TENSOR_LAYER_PROJECTION) { + // nothing to check + } else if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) { if (tn.bid != -1) { GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str()); } @@ -1491,6 +1498,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_TENSOR_LAYER_REPEATING: buft_list = pimpl->dev_layer.at(tn.bid).buft_list; break; + case LLM_TENSOR_LAYER_PROJECTION: + buft_list = pimpl->dev_layer.back().buft_list; + break; default: GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str()); } @@ -3469,7 +3479,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // TODO: vit is cpu only for now vit.buft = ggml_backend_cpu_buffer_type(); - ggml_context * ctx_vision = ctx_map.at(vit.buft); vit.layers.resize(n_vlayer); switch (vparams.arch) { @@ -3477,27 +3486,27 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_VISION_MOBILEVLM: { if (vparams.arch == LLM_ARCH_VISION_LLAVA) { - vit.mm_1_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff}); - vit.mm_1_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff}); - vit.mm_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff}); - vit.mm_2_b 
= ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff}); + vit.mm_1_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ, "weight", 1), {n_vembd, n_vff}, 0); + vit.mm_1_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ, "bias" , 1), {n_vff}, 0); + vit.mm_2_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ, "weight", 2), {n_vff, n_vff}, 0); + vit.mm_2_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ, "bias" , 2), {n_vff}, 0); } else if (vparams.arch == LLM_ARCH_VISION_MOBILEVLM) { - vit.mm_model_mlp_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd}); - vit.mm_model_mlp_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd}); - vit.mm_model_mlp_2_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd}); - vit.mm_model_mlp_2_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd}); - vit.mm_model_peg_0_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd}); - vit.mm_model_peg_0_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd}); + vit.mm_model_mlp_0_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 0), {n_vembd, n_embd}, 0); + vit.mm_model_mlp_0_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 0), {n_embd}, 0); + vit.mm_model_mlp_2_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ_MLP, "weight", 2), {n_embd, n_embd}, 0); + vit.mm_model_mlp_2_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ_MLP, "bias", 2), {n_embd}, 0); + vit.mm_model_peg_0_w = create_tensor(tn(LLM_TENSOR_V_MMPROJ_PEG, "weight", 0), {n_channel, n_channel, 1, n_embd}, 0); + vit.mm_model_peg_0_b = create_tensor(tn(LLM_TENSOR_V_MMPROJ_PEG, "bias", 0), {n_embd}, 0); } - vit.class_embedding = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd}); - vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); - vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + vit.class_embedding = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_CLS ), {n_vembd}, 0); + vit.patch_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}, 0); + vit.position_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}, 0); - vit.pre_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd}); - vit.pre_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd}); - vit.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); - vit.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + vit.pre_norm_w = create_tensor(tn(LLM_TENSOR_V_PRE_NORM, "weight"), {n_vembd}, 0); + vit.pre_norm_b = create_tensor(tn(LLM_TENSOR_V_PRE_NORM, "bias" ), {n_vembd}, 0); + vit.post_norm_w = create_tensor(tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); + vit.post_norm_b = create_tensor(tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, llama_model_loader::TENSOR_NOT_REQUIRED); for (int i = 0; i < n_vlayer; ++i) { auto & layer = vit.layers[i]; @@ -3525,36 +3534,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_VISION_MINICPMV: { - vit.patch_embeddings = 
ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); - vit.patch_bias = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd}); - vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); - - // resampler - int rs_n_embd = llama_vision_n_mmproj_embd(vit); - vit.mm_model_pos_embed_k = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POS_EMBD_K, "weight"), {rs_n_embd, max_pos_embd}); - vit.mm_model_query = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_QUERY, "weight"), {rs_n_embd, 64}); // why 64? - vit.mm_model_proj = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_PROJ, "weight"), {rs_n_embd, rs_n_embd}); - vit.mm_model_kv_proj = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV, "weight"), {n_vembd, rs_n_embd}); - vit.mm_model_attn_q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "weight"), {rs_n_embd, rs_n_embd}); - vit.mm_model_attn_q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "bias" ), {rs_n_embd}); - vit.mm_model_attn_k_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_K, "weight"), {rs_n_embd, rs_n_embd}); - vit.mm_model_attn_k_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_K, "bias" ), {rs_n_embd}); - vit.mm_model_attn_v_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_V, "weight"), {rs_n_embd, rs_n_embd}); - vit.mm_model_attn_v_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_V, "bias" ), {rs_n_embd}); - vit.mm_model_attn_o_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "weight"), {rs_n_embd, rs_n_embd}); - vit.mm_model_attn_o_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "bias" ), {rs_n_embd}); - vit.mm_model_ln_q_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_Q_NORM, "weight"), {rs_n_embd}); - vit.mm_model_ln_q_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_Q_NORM, "bias" ), {rs_n_embd}); - vit.mm_model_ln_kv_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV_NORM, "weight"), {rs_n_embd}); - vit.mm_model_ln_kv_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_KV_NORM, "bias" ), {rs_n_embd}); - vit.mm_model_ln_post_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "weight"), {rs_n_embd}); - vit.mm_model_ln_post_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_RESMPL_POST_NORM, "bias" ), {rs_n_embd}); + vit.patch_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}, 0); + vit.patch_bias = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd}, 0); + vit.position_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}, 0); // tok embd - vit.mm_tok_embd_image = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_IMAGE, "weight"), {n_embd}); - vit.mm_tok_embd_end_image = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "weight"), {n_embd}); - vit.mm_tok_embd_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_SLICE, "weight"), {n_embd}); - vit.mm_tok_embd_end_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd}); + vit.mm_tok_embd_image = create_tensor(tn(LLM_TENSOR_V_TOK_EMBD_IMAGE, "weight"), {n_embd}, 0); + vit.mm_tok_embd_end_image = create_tensor(tn(LLM_TENSOR_V_TOK_EMBD_END_IMAGE, "weight"), {n_embd}, 0); + vit.mm_tok_embd_slice = 
create_tensor(tn(LLM_TENSOR_V_TOK_EMBD_SLICE, "weight"), {n_embd}, 0); + vit.mm_tok_embd_end_slice = create_tensor(tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd}, 0); for (int i = 0; i < n_vlayer; ++i) { auto & layer = vit.layers[i]; @@ -3579,18 +3567,41 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0); layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0); } + + // resampler, we consider it as one layer on top of the encoder + int il = n_vlayer - 1; + int rs_n_embd = llama_vision_n_mmproj_embd(vit); + vit.mm_model_pos_embed_k = create_tensor(tn(LLM_TENSOR_V_RESMPL_POS_EMBD_K, "weight", il), {rs_n_embd, max_pos_embd}, 0); + vit.mm_model_query = create_tensor(tn(LLM_TENSOR_V_RESMPL_QUERY, "weight", il), {rs_n_embd, 64}, 0); // why 64? + vit.mm_model_proj = create_tensor(tn(LLM_TENSOR_V_RESMPL_PROJ, "weight", il), {rs_n_embd, rs_n_embd}, 0); + vit.mm_model_kv_proj = create_tensor(tn(LLM_TENSOR_V_RESMPL_KV, "weight", il), {n_vembd, rs_n_embd}, 0); + vit.mm_model_attn_q_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "weight", il), {rs_n_embd, rs_n_embd}, 0); + vit.mm_model_attn_q_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_Q, "bias" , il), {rs_n_embd}, 0); + vit.mm_model_attn_k_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_K, "weight", il), {rs_n_embd, rs_n_embd}, 0); + vit.mm_model_attn_k_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_K, "bias" , il), {rs_n_embd}, 0); + vit.mm_model_attn_v_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_V, "weight", il), {rs_n_embd, rs_n_embd}, 0); + vit.mm_model_attn_v_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_V, "bias" , il), {rs_n_embd}, 0); + vit.mm_model_attn_o_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "weight", il), {rs_n_embd, rs_n_embd}, 0); + vit.mm_model_attn_o_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_ATTN_OUT, "bias" , il), {rs_n_embd}, 0); + vit.mm_model_ln_q_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_Q_NORM, "weight", il), {rs_n_embd}, 0); + vit.mm_model_ln_q_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_Q_NORM, "bias" , il), {rs_n_embd}, 0); + vit.mm_model_ln_kv_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_KV_NORM, "weight", il), {rs_n_embd}, 0); + vit.mm_model_ln_kv_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_KV_NORM, "bias" , il), {rs_n_embd}, 0); + vit.mm_model_ln_post_w = create_tensor(tn(LLM_TENSOR_V_RESMPL_POST_NORM, "weight", il), {rs_n_embd}, 0); + vit.mm_model_ln_post_b = create_tensor(tn(LLM_TENSOR_V_RESMPL_POST_NORM, "bias" , il), {rs_n_embd}, 0); + } break; case LLM_ARCH_VISION_IDEFICS3: { int scale_factor = vit.hparams.scale_factor; - vit.projection = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_FC, "weight"), {n_vembd * scale_factor * scale_factor, n_embd}); + vit.projection = create_tensor(tn(LLM_TENSOR_V_MMPROJ_FC, "weight"), {n_vembd * scale_factor * scale_factor, n_embd}, 0); - vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}); - vit.patch_bias = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd}); - vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}); + vit.patch_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd}, 0); + vit.patch_bias = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd}, 0); + 
vit.position_embeddings = create_tensor(tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd}, 0); - vit.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}); - vit.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}); + vit.post_norm_w = create_tensor(tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd}, 0); + vit.post_norm_b = create_tensor(tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd}, 0); for (int i = 0; i < n_vlayer; ++i) { auto & layer = vit.layers[i]; diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index 2da15e2ebc95f..bb6ffcf32bf1c 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -15,13 +15,13 @@ // export llama_image_u8 to bmp file for debugging // https://codereview.stackexchange.com/questions/195121/writing-a-bitmap-image-from-c -struct img_size; static int bmp_export(const struct llama_image_u8 &img, const std::string &location); #endif struct img_size { int width; int height; + img_size(int w, int h) : width(w), height(h) {} }; // RGB uint8 image @@ -89,7 +89,7 @@ static img_size select_best_resolution(const img_size & original_size, const std int original_width = original_size.width; int original_height = original_size.height; - img_size best_fit; + img_size best_fit(0, 0); int max_effective_resolution = 0; int min_wasted_resolution = std::numeric_limits::max(); @@ -314,12 +314,12 @@ struct llama_vision_processor_llava : llama_vision_processor { // "spatial_unpad" with "anyres" processing for llava-1.6 std::vector possible_resolutions; for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) { - img_size s; + img_size s(0, 0); s.width = params.image_grid_pinpoints[i]; s.height = params.image_grid_pinpoints[i+1]; possible_resolutions.push_back(s); } - img_size best_resolution = select_best_resolution({img.nx, img.ny}, possible_resolutions); + img_size best_resolution = select_best_resolution(img_size(img.nx, img.ny), possible_resolutions); // debug_image_save_to_bmp(*img, "input.bmp"); temp = resize_and_pad_image(img, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 // debug_image_save_to_bmp(*temp, "resized.bmp"); @@ -415,9 +415,9 @@ struct llama_vision_processor_uhd : llama_vision_processor { return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); } - std::pair find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width = original_size.first; - int height = original_size.second; + img_size find_best_resize(const img_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.width; + int height = original_size.height; if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { float r = static_cast(width) / height; height = static_cast(scale_resolution / std::sqrt(r)); @@ -425,14 +425,14 @@ struct llama_vision_processor_uhd : llama_vision_processor { } int best_width = ensure_divide(width, patch_size); int best_height = ensure_divide(height, patch_size); - return std::make_pair(best_width, best_height); + return img_size(best_width, best_height); } - std::pair get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width, height; - std::tie(width, height) = original_size; - int grid_x, grid_y; - std::tie(grid_x, grid_y) = grid; + img_size get_refine_size(const img_size & 
original_size, const img_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.width; + int height = original_size.height; + int grid_x = grid.width; + int grid_y = grid.height; int refine_width = ensure_divide(width, grid_x); int refine_height = ensure_divide(height, grid_y); @@ -441,16 +441,14 @@ struct llama_vision_processor_uhd : llama_vision_processor { int grid_height = refine_height / grid_y; // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) - auto best_grid_size = find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair - int best_grid_width, best_grid_height; - std::tie(best_grid_width, best_grid_height) = best_grid_size; + auto best_grid = find_best_resize({grid_width, grid_height}, scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair - // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) - std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) + // img_size refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) + img_size refine_size = img_size(best_grid.width * grid_x, best_grid.height * grid_y); // (new line) return refine_size; } - std::pair find_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + img_size find_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { std::vector candidate_split_grids_nums; for (int i : {multiple - 1, multiple, multiple + 1}) { if (i == 1 || i > max_slice_nums) { @@ -459,7 +457,7 @@ struct llama_vision_processor_uhd : llama_vision_processor { candidate_split_grids_nums.push_back(i); } - std::vector> candidate_grids; + std::vector candidate_grids; for (int split_grids_nums : candidate_split_grids_nums) { int m = 1; while (m <= split_grids_nums) { @@ -470,10 +468,10 @@ struct llama_vision_processor_uhd : llama_vision_processor { } } - std::pair best_grid{1, 1}; + img_size best_grid = img_size(1, 1); float min_error = std::numeric_limits::infinity(); for (const auto& grid : candidate_grids) { - float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); + float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height)); if (error < min_error) { best_grid = grid; min_error = error; @@ -487,7 +485,7 @@ struct llama_vision_processor_uhd : llama_vision_processor { const int max_slice_nums = 9, const int scale_resolution = 448, const int patch_size = 14) { - const std::pair original_size={img.nx,img.ny}; + const img_size original_size = img_size(img.nx, img.ny); const int original_width = img.nx; const int original_height = img.ny; const float log_ratio = log(1.0*original_width/original_height); @@ -501,34 +499,36 @@ struct llama_vision_processor_uhd : llama_vision_processor { if (multiple <= 1) { auto best_size = find_best_resize(original_size, scale_resolution, patch_size, true); llama_image_u8 source_image; - bicubic_resize(img, source_image, best_size.first, best_size.second); + bicubic_resize(img, source_image, best_size.width, best_size.height); // source_image = image.resize(best_size, Image.Resampling.BICUBIC) images.back().push_back(source_image); } else if (multiple > 1) { auto best_size = 
find_best_resize(original_size, scale_resolution, patch_size); llama_image_u8 source_image; - bicubic_resize(img, source_image, best_size.first, best_size.second); + bicubic_resize(img, source_image, best_size.width, best_size.height); // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - LLAMA_LOG_DEBUG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img.nx, img.ny, best_size.first, best_size.second); + LLAMA_LOG_DEBUG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img.nx, img.ny, best_size.width, best_size.height); images.back().push_back(source_image); - std::pair best_grid = find_best_grid(max_slice_nums, multiple, log_ratio); - LLAMA_LOG_DEBUG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img.nx, img.ny, best_grid.first, best_grid.second); + img_size best_grid = find_best_grid(max_slice_nums, multiple, log_ratio); + LLAMA_LOG_DEBUG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img.nx, img.ny, best_grid.width, best_grid.height); auto refine_size = get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); llama_image_u8 refine_image; - bicubic_resize(img, refine_image, refine_size.first, refine_size.second); + // TODO: so far, we spend most of the time in bicubic_resize, we should optimize it + bicubic_resize(img, refine_image, refine_size.width, refine_size.height); - LLAMA_LOG_DEBUG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image.nx, refine_image.ny, refine_size.first, refine_size.second); + LLAMA_LOG_DEBUG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image.nx, refine_image.ny, refine_size.width, refine_size.height); // split_to_patches int width = refine_image.nx; int height = refine_image.ny; - int grid_x = int(width / best_grid.first); - int grid_y = int(height / best_grid.second); - for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ + int grid_x = int(width / best_grid.width); + int grid_y = int(height / best_grid.height); + for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.height; patches_i += grid_y, ic += 1){ + std::vector patches_out; images.push_back(std::vector()); - for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ + for (int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.width; patches_j += grid_x, jc += 1) { llama_image_u8 patch; patch.nx = grid_x; patch.ny = grid_y; @@ -542,8 +542,9 @@ struct llama_vision_processor_uhd : llama_vision_processor { patch.buf[j+2] = refine_image.buf[i+2]; } } - images.back().push_back(patch); + patches_out.push_back(std::move(patch)); } + images.push_back(std::move(patches_out)); } } return images; @@ -551,7 +552,6 @@ struct llama_vision_processor_uhd : llama_vision_processor { virtual llama_vision_tokens tokenize(const llama_image_u8 & img) override { auto & params = ctx.model->hparams; - GGML_ASSERT(params.arch == LLM_ARCH_VISION_MINICPMV); std::vector> imgs = slice_image(img); @@ -573,6 +573,10 @@ struct llama_vision_processor_uhd : llama_vision_processor { } }; +// +// cgraph builder +// + // TODO: move this to llm_build_context in llama.cpp struct llama_vision_graph_builder { llama_vision_context & ctx; @@ -590,6 +594,7 @@ struct llama_vision_graph_builder { int img_h; bool use_gelu; int n_layers; + int rs_n_embd; vision_projector_type proj_type; llama_vision_graph_builder(llama_vision_context & ctx, const llama_vision_tokens & inp) : ctx(ctx), 
model(*ctx.model) { @@ -950,7 +955,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ GGML_ASSERT(batch_size == 1); // TODO: support multiple images } - img_size image_size{(int)hparams.image_size, (int)hparams.image_size}; + img_size image_size = img_size((int)hparams.image_size, (int)hparams.image_size); const int patch_size = hparams.patch_size; const int num_patches = ((image_size.width / patch_size) * (image_size.height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 1 : 0); @@ -1016,23 +1021,25 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "inp_pos"); - std::vector pos_buf(ggml_nelements(positions)); - GGML_ASSERT(num_positions == (int)pos_buf.size()); + std::vector buf(ggml_nelements(positions)); + GGML_ASSERT(num_positions == (int)buf.size()); int bucket_coords_h[70]; int bucket_coords_w[70]; - for (size_t i = 0; i < inp.n_py; i++) { - bucket_coords_h[i] = std::floor(70.0*i/inp.n_py); + size_t h = inp.py; + size_t w = inp.py; + for (size_t i = 0; i < h; i++) { + bucket_coords_h[i] = std::floor(70.0*i/h); } - for (size_t i = 0; i < inp.n_px; i++) { - bucket_coords_w[i] = std::floor(70.0*i/inp.n_px); + for (size_t i = 0; i < w; i++) { + bucket_coords_w[i] = std::floor(70.0*i/w); } - for (size_t i = 0, id = 0; i < inp.n_py; i++){ - for (size_t j = 0; j < inp.n_px; j++){ - pos_buf[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + for (size_t i = 0, id = 0; i < h; i++){ + for (size_t j = 0; j < w; j++){ + buf[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; } } - ggml_backend_tensor_set(positions, pos_buf.data(), 0, ggml_nbytes(positions)); + ggml_backend_tensor_set(positions, buf.data(), 0, ggml_nbytes(positions)); } else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "inp_pos"); @@ -1055,6 +1062,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ } // compute + LLAMA_LOG_DEBUG("%s: compute start\n", __func__); int64_t t_start = ggml_time_ms(); ggml_backend_sched_graph_compute(ctx.sched, gf); @@ -1106,7 +1114,6 @@ struct llama_vision_tokens * llama_vision_tokenize( case LLM_ARCH_VISION_IDEFICS3: return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp)); case LLM_ARCH_VISION_MINICPMV: - //return new llama_vision_tokens(llama_vision_processor_uhd(vctx).tokenize(*bmp)); return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp)); default: GGML_ASSERT(false && "unsupported arch"); From fa55281759596079e73de38bc8db250c21dc76a5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 6 Feb 2025 20:32:09 +0100 Subject: [PATCH 17/25] separate vision ctx and llm ctx --- examples/vision/vision.cpp | 14 ++++- include/llama.h | 23 +++++++- src/llama-arch.cpp | 4 +- src/llama-context.h | 3 - src/llama-vision.cpp | 112 ++++++++++++++++++++++++++++++++----- src/llama-vision.h | 7 ++- src/llama.cpp | 11 +--- 7 files changed, 139 insertions(+), 35 deletions(-) diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp index d97067bba616f..359a023ae86e3 100644 --- a/examples/vision/vision.cpp +++ b/examples/vision/vision.cpp @@ -120,6 +120,14 @@ int main(int argc, char ** argv) { return 1; } + 
llama_vision_context_params vparams = llama_vision_context_default_params(); + vparams.n_threads = llama_n_threads(ctx); + llama_vision_context * vctx = llama_vision_init_from_model(model, vparams); + if (!vctx) { + LOG_ERR("model does not have vision encoder\n"); + return 1; + } + struct common_sampler * smpl = common_sampler_init(model, params.sampling); llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); @@ -136,12 +144,12 @@ int main(int argc, char ** argv) { } llama_vision_bitmap * img = load_image_from_file(img_path); LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny); - img_tokens = llama_vision_tokenize(ctx, img); + img_tokens = llama_vision_tokenize(vctx, img); if (!img_tokens) { LOG_ERR("failed to create image tokens\n"); return 1; } - if (llama_vision_encode(ctx, img_tokens)) { + if (llama_vision_encode(vctx, img_tokens)) { LOG_ERR("failed to encode image\n"); return 1; } @@ -163,7 +171,7 @@ int main(int argc, char ** argv) { return 1; } } else { - auto * img_embd = llama_vision_get_output_tensor(ctx); + auto * img_embd = llama_vision_get_output_tensor(vctx); // std::vector output_debug(ggml_nelements(img_embd)); // ggml_backend_tensor_get(img_embd, output_debug.data(), 0, ggml_nbytes(img_embd)); // for (int row = 0; row < 10; row++) { diff --git a/include/llama.h b/include/llama.h index 04c06ac11a9f6..762dd3105ca8d 100644 --- a/include/llama.h +++ b/include/llama.h @@ -229,6 +229,8 @@ extern "C" { bool sorted; } llama_token_data_array; + struct llama_vision_context; + // Structure represents the basic input unit of vision model // This can be a processed image or slices of images under the hood struct llama_vision_tokens; @@ -365,6 +367,10 @@ extern "C" { void * abort_callback_data; }; + struct llama_vision_context_params { + int32_t n_threads; + }; + // model quantization parameters typedef struct llama_model_quantize_params { int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() @@ -402,6 +408,7 @@ extern "C" { // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172) LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void); + LLAMA_API struct llama_vision_context_params llama_vision_context_default_params(void); LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); @@ -1297,20 +1304,30 @@ extern "C" { // Vision API // + // Vision context + LLAMA_API struct llama_vision_context * llama_vision_init_from_model( + const struct llama_model * model, + struct llama_vision_context_params params); + LLAMA_API void llama_vision_free(struct llama_vision_context * ctx); + // Container for RGB bitmap LLAMA_API struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny); LLAMA_API void llama_vision_bitmap_free(struct llama_vision_bitmap * bmp); // Create image tokens from the RGB bitmap - LLAMA_API struct llama_vision_tokens * llama_vision_tokenize(struct llama_context * ctx, llama_vision_bitmap * bmp); + LLAMA_API struct llama_vision_tokens * llama_vision_tokenize( + struct llama_vision_context * ctx, + struct llama_vision_bitmap * bmp); LLAMA_API void llama_vision_tokens_free(struct llama_vision_tokens * img_tokens); // User must reserve N number of tokens in tokenized text prompt for each image // LLAMA_API 
int32_t llama_vision_get_n_tokens(const llama_vision_img_tokens * img_tokens); // Encode patches into embeddings - LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_tokens * img_tokens); - LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_context * ctx); + LLAMA_API int32_t llama_vision_encode( + struct llama_vision_context * ctx, + struct llama_vision_tokens * img_tokens); + LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_vision_context * ctx); // // Model split diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index fc5217fe9e3ec..19a7e3f3f2f49 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1576,8 +1576,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_V_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_V_ENC_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_V_ENC_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_V_PRE_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_V_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_V_PRE_NORM, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL}}, + {LLM_TENSOR_V_POST_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_V_RESMPL_POS_EMBD_K, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_ADD}}, {LLM_TENSOR_V_RESMPL_ATTN_Q, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, {LLM_TENSOR_V_RESMPL_ATTN_K, {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}}, diff --git a/src/llama-context.h b/src/llama-context.h index f2704b89e96b8..69fd5557b8e11 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -108,9 +108,6 @@ struct llama_context { struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] - - // vision - llama_vision_context vctx; }; // TODO: make these methods of llama_context diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp index bb6ffcf32bf1c..f961acc81d079 100644 --- a/src/llama-vision.cpp +++ b/src/llama-vision.cpp @@ -982,7 +982,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ } // alloc memory for graph - bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf); + bool ok = ggml_backend_sched_alloc_graph(ctx.sched.get(), gf); if (!ok) { LLAMA_LOG_ERROR("failed to alloc memory for graph\n"); return -1; @@ -1064,7 +1064,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ // compute LLAMA_LOG_DEBUG("%s: compute start\n", __func__); int64_t t_start = ggml_time_ms(); - ggml_backend_sched_graph_compute(ctx.sched, gf); + ggml_backend_sched_graph_compute(ctx.sched.get(), gf); // the last node is the embedding tensor struct ggml_tensor * output_node = ggml_graph_node(gf, -1); @@ -1091,6 +1091,92 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_ //////////////////////////////////////////////////////////////////////////////////////// // public API +struct llama_vision_context_params llama_vision_context_default_params() { + return { + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default + }; +} + +struct llama_vision_context * llama_vision_init_from_model(const struct llama_model * model, struct llama_vision_context_params params) { + if (!model->has_vision) { + return nullptr; + } + + llama_vision_context * ctx = new llama_vision_context; + ctx->model = &model->vit; + + // TODO: this looks 
ugly, mostly copied from llama.cpp, refactor it in the future + + // init backends + { + // add CPU backend + ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (ctx->backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + llama_vision_free(ctx); + return nullptr; + } + ctx->backends.emplace_back(ctx->backend_cpu); + + // create a list of the set_n_threads functions in the backends + for (auto & backend : ctx->backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + ggml_backend_set_n_threads_fn(backend.get(), params.n_threads); + } + } + } + + // scheduler and compute buffers + { + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : ctx->backends) { + auto * buft = ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model->devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); + } + + const size_t max_nodes = model->max_nodes(); + + // buffer used to store the computation graph and the tensor meta data + ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + // TODO: support pipeline_parallel + const bool pipeline_parallel = false; + + ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get())); + } + } + + const size_t max_nodes = VISION_GRAPH_MAX_NODE; // TODO: make it dynamic + ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + return ctx; +} + +void llama_vision_free(struct llama_vision_context * ctx) { + if (ctx->ctx_ggml) { + ggml_free(ctx->ctx_ggml); + } + delete ctx; +} + struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny) { llama_vision_bitmap * bmp = new llama_vision_bitmap; bmp->nx = nx; @@ -1105,16 +1191,15 @@ void llama_vision_bitmap_free(llama_vision_bitmap * bmp) { } struct llama_vision_tokens * llama_vision_tokenize( - struct llama_context * ctx, - llama_vision_bitmap * bmp) { - llama_vision_context & vctx = ctx->vctx; - switch (vctx.model->hparams.arch) { + struct llama_vision_context * ctx, + struct llama_vision_bitmap * bmp) { + switch (ctx->model->hparams.arch) { case LLM_ARCH_VISION_LLAVA: case LLM_ARCH_VISION_MOBILEVLM: case LLM_ARCH_VISION_IDEFICS3: - return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp)); + return new llama_vision_tokens(llama_vision_processor_llava(*ctx).tokenize(*bmp)); case LLM_ARCH_VISION_MINICPMV: - return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp)); + return new 
llama_vision_tokens(llama_vision_processor_llava(*ctx).tokenize(*bmp)); default: GGML_ASSERT(false && "unsupported arch"); } @@ -1124,19 +1209,18 @@ void llama_vision_tokens_free(llama_vision_tokens * p) { delete p; } -int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_tokens * p) { +int32_t llama_vision_encode(struct llama_vision_context * ctx, struct llama_vision_tokens * p) { if (p->buf.empty()) { LLAMA_LOG_ERROR("%s: nothing to encode\n", __func__); return -1; } - llama_vision_context & vctx = ctx->vctx; - auto & hparams = vctx.model->hparams; + auto & hparams = ctx->model->hparams; switch (hparams.mm_patch_merge_type) { case MM_PATCH_MERGE_FLAT: { // flat / default llava-1.5 type embedding - int32_t encoded = llama_vision_encode_impl(vctx, *p); + int32_t encoded = llama_vision_encode_impl(*ctx, *p); if (encoded != 0) { LLAMA_LOG_ERROR("Unable to encode image\n"); return encoded; @@ -1154,8 +1238,8 @@ int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_tokens * p) return 0; } -struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx) { - return ctx->vctx.output; +struct ggml_tensor * llama_vision_get_output_tensor(struct llama_vision_context * ctx) { + return ctx->output; } //////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/llama-vision.h b/src/llama-vision.h index 953ec57953079..d1ba10c30cd30 100644 --- a/src/llama-vision.h +++ b/src/llama-vision.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-cpp.h" #include "llama.h" #include "llama-arch.h" @@ -142,12 +143,14 @@ struct llama_vision_model { struct llama_vision_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; - ggml_backend_sched_t sched = nullptr; - struct ggml_context * ctx_ggml = nullptr; + ggml_backend_sched_ptr sched; + std::vector backends; + ggml_backend_t backend_cpu; const llama_vision_model * model; // temporary output data, to be picked up by llama_decode() + struct ggml_context * ctx_ggml = nullptr; struct ggml_tensor * output; }; diff --git a/src/llama.cpp b/src/llama.cpp index 31c4e61fa18b0..99b539fb4b9ed 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8460,7 +8460,9 @@ static int llama_prepare_sbatch( // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + GGML_ASSERT((batch.token && !batch.embd && !batch.embd_tensor) + || (!batch.token && batch.embd && !batch.embd_tensor) + || (!batch.token && !batch.embd && batch.embd_tensor)); // NOLINT if (batch.token) { for (uint32_t i = 0; i < n_tokens_all; ++i) { if (batch.token[i] < 0 || uint32_t(batch.token[i]) >= model.vocab.n_tokens()) { @@ -9893,13 +9895,6 @@ struct llama_context * llama_init_from_model( } } - if (model->has_vision) { - ctx->vctx.model = &model->vit; - ctx->vctx.sched = ctx->sched.get(); - const size_t max_nodes = VISION_GRAPH_MAX_NODE; // TODO: make it dynamic - ctx->vctx.buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - } - return ctx; } From 78632328f3afc534d31c52190d7fb1c2d3d44b02 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 1 Mar 2025 22:47:21 +0100 Subject: [PATCH 18/25] clarify --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py 
index 8926bd328ee9e..1fe27b6718beb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1670,6 +1670,7 @@ def __init__(self, *args, **kwargs): self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict() self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM + # only tested with https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct if "vision_config" in self.hparams and model_type == "idefics3": self.vparams = self.hparams["vision_config"] self.preprocessor_config = self.load_preprocessor_config(self.dir_model) From c4e9231cbf280272445a60c0836ca36f3413145c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 1 Mar 2025 22:52:11 +0100 Subject: [PATCH 19/25] fix smolVLM conversion --- convert_hf_to_gguf.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1fe27b6718beb..6fa1d90f3ccb2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1762,15 +1762,14 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter is_vision_tensor = "vision_tower" in name or "vision_model" in name if is_vision_tensor: - if name.startswith("model.text_model"): - name = name.replace("text_model.", "") # for SmolVLM - else: - name = name.replace("model.vision_tower.", "") + name = name.replace("model.vision_tower.", "") if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3: return [] # skip post_layernorm if not is_vision_tensor: - if name.startswith("language_model"): + if name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM + elif name.startswith("language_model"): # language model tensors, remove the prefix name = name.replace("language_model.", "") if name.endswith(("q_proj.weight", "q_proj.bias")): From 21aa2f5af70d306f351c4bb82eedc73a1de4f7aa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 1 Mar 2025 23:11:25 +0100 Subject: [PATCH 20/25] phi-4-mm TEXT-ONLY for now --- convert_hf_to_gguf.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6fa1d90f3ccb2..b96f373409b3c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2707,7 +2707,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_add_bos_token(False) -@Model.register("Phi3ForCausalLM") +@Model.register("Phi3ForCausalLM", "Phi4MMForCausalLM") class Phi3MiniModel(Model): model_arch = gguf.MODEL_ARCH.PHI3 @@ -2718,7 +2718,7 @@ def set_vocab(self): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) tokenizer_class = tokenizer_config_json['tokenizer_class'] - if tokenizer_class == 'GPT2Tokenizer': + if tokenizer_class == 'GPT2Tokenizer' or tokenizer_class == 'GPT2TokenizerFast': return self._set_vocab_gpt2() from sentencepiece import SentencePieceProcessor @@ -2884,6 +2884,16 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "base_layer" in name: + name = name.replace("base_layer.", "") + # TODO: a big TODO, for simplification, we are skipping multimodal tensors for now + if 
name.startswith("model.embed_tokens_extend") or "lora_" in name: + logger.error(f"Skipping multimodal tensor: {name!r}") + return [] + else: + return super().modify_tensors(data_torch, name, bid) + @Model.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): From 0ead9c4526f8a234c818880c8c7d25e4f6b9e42c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 2 Mar 2025 10:29:56 +0100 Subject: [PATCH 21/25] Revert "fix smolVLM conversion" This reverts commit c4e9231cbf280272445a60c0836ca36f3413145c. --- convert_hf_to_gguf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b96f373409b3c..82974c0e9e956 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1762,14 +1762,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter is_vision_tensor = "vision_tower" in name or "vision_model" in name if is_vision_tensor: - name = name.replace("model.vision_tower.", "") + if name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM + else: + name = name.replace("model.vision_tower.", "") if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3: return [] # skip post_layernorm if not is_vision_tensor: - if name.startswith("model.text_model"): - name = name.replace("text_model.", "") # for SmolVLM - elif name.startswith("language_model"): + if name.startswith("language_model"): # language model tensors, remove the prefix name = name.replace("language_model.", "") if name.endswith(("q_proj.weight", "q_proj.bias")): From 45bc1882df8f575df52993534c15af21a07db11e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 2 Mar 2025 13:22:26 +0100 Subject: [PATCH 22/25] a bit cleaner for llava conversion --- convert_hf_to_gguf.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 82974c0e9e956..f5f94107283b9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -508,12 +508,7 @@ def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) if "text_config" in hparams: - text_config = hparams["text_config"] - model_id = text_config.get("_name_or_path", None) - # for example, llava-1.5-7b-hf misses the language model config, need to retrieve it via model ID - if model_id is not None and model_id != "None" and model_id != "": - text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict() - hparams = {**text_config, **hparams} + hparams = {**hparams["text_config"], **hparams} return hparams @staticmethod @@ -1646,14 +1641,14 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM", "Idefics3ForConditionalGeneration") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "MobileLlamaForCausalLM", "Idefics3ForConditionalGeneration") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - model_type = self.hparams.get("model_type", None) + model_type = self.hparams.get("model_type") self.vision_arch = None # only tested with https://huggingface.co/llava-hf/llava-1.5-7b-hf @@ -1762,15 +1757,14 @@ def modify_tensors(self, data_torch: 
Tensor, name: str, bid: int | None) -> Iter is_vision_tensor = "vision_tower" in name or "vision_model" in name if is_vision_tensor: - if name.startswith("model.text_model"): - name = name.replace("text_model.", "") # for SmolVLM - else: - name = name.replace("model.vision_tower.", "") + name = name.replace("model.vision_tower.", "") if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3: return [] # skip post_layernorm if not is_vision_tensor: - if name.startswith("language_model"): + if name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM + elif name.startswith("language_model"): # language model tensors, remove the prefix name = name.replace("language_model.", "") if name.endswith(("q_proj.weight", "q_proj.bias")): @@ -1853,6 +1847,22 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("LlavaForConditionalGeneration") +class LlavaModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + def __init__(self, *args, **kwargs): + # quick fix for llava model + # see: https://huggingface.co/llava-hf/llava-1.5-7b-hf/discussions/34 + hparams = Model.load_hparams(kwargs["dir_model"]) + if "vision_config" in hparams and hparams.get("model_type") == "llava": + text_config = hparams["text_config"] + text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict() + kwargs["hparams"] = {**text_config, **hparams} + + super().__init__(*args, **kwargs) + + @Model.register("DeciLMForCausalLM") class DeciModel(Model): model_arch = gguf.MODEL_ARCH.DECI From 5283a150205febd8846fb00fbe4fa956c2765b7b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 2 Mar 2025 13:22:37 +0100 Subject: [PATCH 23/25] Revert "phi-4-mm TEXT-ONLY for now" This reverts commit 21aa2f5af70d306f351c4bb82eedc73a1de4f7aa. 
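A note on the LlavaModel subclass introduced in [PATCH 22/25] above: checkpoints such as llava-hf/llava-1.5-7b-hf are missing most of the language-model config, so the full hyperparameters have to be fetched through the model ID stored in text_config["_name_or_path"] before LlamaModel.__init__ runs. The sketch below mirrors that merge as a stand-alone helper, assuming transformers' AutoConfig is available; the function name resolve_llava_hparams is illustrative and not part of the patch.

    import json
    from pathlib import Path

    from transformers import AutoConfig

    def resolve_llava_hparams(dir_model: Path) -> dict:
        # load the checkpoint's own config.json first
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            hparams = json.load(f)
        if "vision_config" in hparams and hparams.get("model_type") == "llava":
            # the bundled text_config is nearly empty; pull the full language
            # model config via its model ID and merge the local values on top
            text_config = hparams["text_config"]
            text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
            hparams = {**text_config, **hparams}
        return hparams

Keeping this in the dedicated subclass means the generic Model.load_hparams no longer pays the AutoConfig round-trip for every architecture that happens to carry a text_config.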
--- convert_hf_to_gguf.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f5f94107283b9..4d14ba2e98f7c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2718,7 +2718,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_add_bos_token(False) -@Model.register("Phi3ForCausalLM", "Phi4MMForCausalLM") +@Model.register("Phi3ForCausalLM") class Phi3MiniModel(Model): model_arch = gguf.MODEL_ARCH.PHI3 @@ -2729,7 +2729,7 @@ def set_vocab(self): with open(tokenizer_config_file, "r", encoding="utf-8") as f: tokenizer_config_json = json.load(f) tokenizer_class = tokenizer_config_json['tokenizer_class'] - if tokenizer_class == 'GPT2Tokenizer' or tokenizer_class == 'GPT2TokenizerFast': + if tokenizer_class == 'GPT2Tokenizer': return self._set_vocab_gpt2() from sentencepiece import SentencePieceProcessor @@ -2895,16 +2895,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "base_layer" in name: - name = name.replace("base_layer.", "") - # TODO: a big TODO, for simplification, we are skipping multimodal tensors for now - if name.startswith("model.embed_tokens_extend") or "lora_" in name: - logger.error(f"Skipping multimodal tensor: {name!r}") - return [] - else: - return super().modify_tensors(data_torch, name, bid) - @Model.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): From cee80d48facf4429359699869b799ebf78cb01bf Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 16 Mar 2025 12:17:16 +0100 Subject: [PATCH 24/25] fix merge problem --- src/llama-graph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 4e90873397ca4..6729ace7da375 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -43,7 +43,7 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); } - if (ubatch->embd) { + if (ubatch->embd && !ubatch->embd_tensor) { const int64_t n_embd = embd->ne[0]; const int64_t n_tokens = ubatch->n_tokens; From cdff8c5e99c212e9f792216f596dde760c55fa94 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 16 Mar 2025 12:19:37 +0100 Subject: [PATCH 25/25] fix merge (2) --- src/llama-batch.cpp | 1 + src/llama-context.cpp | 5 ++++- src/llama-graph.cpp | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 5ed32d8595256..c656e16093520 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -57,6 +57,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s ubatch.token = nullptr; } if (batch->embd_tensor) { + // TODO @ngxson : we also need to split the tensor by doing a ggml_view ubatch.embd_tensor = batch->embd_tensor; } else if (batch->embd) { if (ubatch.equal_seqs) { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c2fcce42a7d58..35d65b2ca2ae9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1194,7 +1194,10 @@ int llama_context::decode(llama_batch & inp_batch) { batch_guard bg(*kv_self); - GGML_ASSERT((!batch.token && 
batch.embd) || (batch.token && !batch.embd)); // NOLINT + // TODO @ngxson : we can do better than this + GGML_ASSERT((batch.token && !batch.embd && !batch.embd_tensor) + || (!batch.token && batch.embd && !batch.embd_tensor) + || (!batch.token && !batch.embd && batch.embd_tensor)); // NOLINT if (batch.token) { for (int64_t i = 0; i < n_tokens_all; ++i) { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 6729ace7da375..d5b603ce0a175 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -983,6 +983,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { cur = ggml_add(ctx0, cur, inpL_delta); } + } else if (ubatch.embd_tensor) { + inp->embd = ubatch.embd_tensor; + ggml_set_input(ubatch.embd_tensor); + } else { inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); ggml_set_input(inp->embd);
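The last two patches together let a batch carry a pre-built embedding tensor (for example, the output of the vision encoder) instead of token IDs or raw float embeddings: llama_context::decode now asserts that exactly one of batch.token, batch.embd, or batch.embd_tensor is set, and build_inp_embd reuses the attached tensor directly as a graph input rather than allocating a fresh one. A minimal sketch of that tensor selection, assuming only the public ggml API; pick_embd_input is a hypothetical helper, not llama.cpp code:

    #include "ggml.h"

    // reduced version of the branch added to build_inp_embd in [PATCH 25/25]
    static ggml_tensor * pick_embd_input(ggml_context * ctx0, ggml_tensor * embd_tensor,
                                         int64_t n_embd, int64_t n_tokens) {
        if (embd_tensor) {
            // an embedding tensor was attached to the batch: mark it as a graph
            // input and use it as-is
            ggml_set_input(embd_tensor);
            return embd_tensor;
        }
        // otherwise allocate a fresh F32 input to be filled from ubatch->embd
        // (the token-ID path via get_rows is omitted here)
        ggml_tensor * embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
        ggml_set_input(embd);
        return embd;
    }

    int main() {
        ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
        ggml_context * ctx = ggml_init(params);

        // no tensor attached -> a new input tensor is created
        ggml_tensor * embd = pick_embd_input(ctx, /*embd_tensor*/ nullptr, /*n_embd*/ 4096, /*n_tokens*/ 8);
        GGML_ASSERT(embd != nullptr);

        ggml_free(ctx);
        return 0;
    }

As noted in the TODO in llama-batch.cpp, splitting an attached embd_tensor across ubatches (e.g. via a ggml_view) is still left for a follow-up.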