From f07c2ec505f2ba93c3ec8246b258a9c97c7c1660 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 24 Jan 2025 20:56:09 +0100
Subject: [PATCH 1/6] llama : add option to override tensor buffers

---
 common/arg.cpp             | 38 ++++++++++++++++++++++++++++++++++++++
 common/common.cpp          | 10 ++++++++++
 common/common.h            |  1 +
 include/llama.h            |  8 ++++++++
 src/llama-model-loader.cpp |  5 ++++-
 src/llama-model-loader.h   |  8 +++++---
 src/llama-model.cpp        | 21 +++++++++++++++++++--
 src/llama-quant.cpp        |  2 +-
 src/llama.cpp              |  2 +-
 9 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a6226a34b1860..d746f832e541d 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1,5 +1,6 @@
 #include "arg.h"
 
+#include "common.h"
 #include "log.h"
 #include "sampling.h"
 
@@ -321,6 +322,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
 
+    if (!params.tensor_buft_overrides.empty()) {
+        params.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     if (params.reranking && params.embedding) {
         throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
     }
@@ -1477,6 +1482,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type", [](common_params & params, const std::string & value) {
+            static std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            if (buft_list.empty()) {
+                // enumerate all the devices and add their buffer types to the list
+                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                    auto * dev = ggml_backend_dev_get(i);
+                    auto * buft = ggml_backend_dev_buffer_type(dev);
+                    buft_list[ggml_backend_buft_name(buft)] = buft;
+                }
+            }
+
+            for (const auto & override : string_split<std::string>(value, ',')) {
+                std::string::size_type pos = override.find('=');
+                if (pos == std::string::npos) {
+                    throw std::invalid_argument("invalid value");
+                }
+                std::string tensor_name = override.substr(0, pos);
+                std::string buffer_type = override.substr(pos + 1);
+
+                if (buft_list.find(buffer_type) == buft_list.end()) {
+                    printf("Available buffer types:\n");
+                    for (const auto & it : buft_list) {
+                        printf(" %s\n", ggml_backend_buft_name(it.second));
+                    }
+                    throw std::invalid_argument("unknown buffer type");
+                }
+                // FIXME: this leaks memory
+                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+            }
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
diff --git a/common/common.cpp b/common/common.cpp
index 6dea8e3d25238..1af628625ffe1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1083,15 +1083,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (!params.devices.empty()) {
         mparams.devices = params.devices.data();
     }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1099,6 +1102,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
     return mparams;
 }
 
diff --git a/common/common.h b/common/common.h
index 571260372090f..9b42a8944d618 100644
--- a/common/common.h
+++ b/common/common.h
@@ -256,6 +256,7 @@ struct common_params {
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
diff --git a/include/llama.h b/include/llama.h
index 3b75e760780ef..26c6dd12828c5 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -275,10 +275,18 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
         ggml_backend_dev_t * devices;
 
+        // NULL-terminated list of buffer types to use for tensors that match a pattern
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 75073bf610ac3..c64e974a94f57 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p) {
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader(
         }
     }
 
+    tensor_buft_overrides = param_tensor_buft_overrides_p;
+
     // Load the main GGUF
     struct ggml_context * ctx = NULL;
     struct gguf_init_params params = {
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index fe35404b26889..0f52b011b6986 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -77,8 +77,9 @@ struct llama_model_loader {
 
     llama_mmaps mappings;
 
-    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
-    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
+    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+    const llama_model_tensor_buft_override * tensor_buft_overrides;
 
     gguf_context_ptr meta;
     std::vector<ggml_context_ptr> contexts;
@@ -95,7 +96,8 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p);
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 031b4c30b75dd..6b1653536f39e 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1444,9 +1444,25 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
         }
 
-        ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+        ggml_backend_buffer_type_t buft = nullptr;
+
+        // check overrides
+        if (ml.tensor_buft_overrides) {
+            std::string tensor_name = tn.str();
+            for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+                if (tensor_name.find(overrides->pattern) != std::string::npos) {
+                    LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
+                    buft = overrides->buft;
+                    break;
+                }
+            }
+        }
+
         if (!buft) {
-            throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+            buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+            if (!buft) {
+                throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+            }
         }
 
         // avoid using a host buffer when using mmap
@@ -3757,6 +3773,7 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.devices =*/ nullptr,
+        /*.tensor_buft_overrides =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb7982655a373..ab50c5d179a29 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
diff --git a/src/llama.cpp b/src/llama.cpp
index e8cfe5012819c..e2ca1d7b45c47 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 

From 6c8d01a8bbe0d64491608089027c26ac85cce262 Mon Sep 17 00:00:00 2001
From: slaren
Date: Sun, 2 Feb 2025 17:23:32 +0100
Subject: [PATCH 2/6] add regex support

---
 src/llama-model.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index f1cba4f39a676..f134d1bf1ef2a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include <regex>
 #include 
 #include 
 
@@ -1464,7 +1465,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         if (ml.tensor_buft_overrides) {
             std::string tensor_name = tn.str();
             for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
-                if (tensor_name.find(overrides->pattern) != std::string::npos) {
+                std::regex pattern(overrides->pattern);
+                if (std::regex_search(tensor_name, pattern)) {
                     LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
                     buft = overrides->buft;
                     break;

From 538f60934abd36f19598d74518cdef0ccd18a023 Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 6 Feb 2025 01:32:04 +0100
Subject: [PATCH 3/6] ggml : fix possible underflow in ggml_nbytes

---
 ggml/src/ggml.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 3b48615421187..52c553e76b29f 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1151,6 +1151,12 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 }
 
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        if (tensor->ne[i] <= 0) {
+            return 0;
+        }
+    }
+
     size_t nbytes;
     const size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {

From 8770ffa60c0d0eac481f199f2da1bb6b622a8207 Mon Sep 17 00:00:00 2001
From: slaren
Date: Sun, 9 Feb 2025 00:32:52 +0100
Subject: [PATCH 4/6] rebuild buft list on every call

---
 common/arg.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 5a98c4baf3a83..e796d0e85f946 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1485,13 +1485,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
-            static std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
             if (buft_list.empty()) {
                 // enumerate all the devices and add their buffer types to the list
                 for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                     auto * dev = ggml_backend_dev_get(i);
                     auto * buft = ggml_backend_dev_buffer_type(dev);
-                    buft_list[ggml_backend_buft_name(buft)] = buft;
+                    if (buft) {
+                        buft_list[ggml_backend_buft_name(buft)] = buft;
+                    }
                 }
             }
 

From ab2d43a9787818a83e232c437c961521673b4e07 Mon Sep 17 00:00:00 2001
From: slaren
Date: Sun, 2 Mar 2025 13:09:08 +0100
Subject: [PATCH 5/6] wip

---
 src/llama.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index de60ca050682a..132e9b9e660ef 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9813,6 +9813,7 @@ struct llama_context * llama_init_from_model(
             model->n_devices() > 1 &&
             model->params.n_gpu_layers > (int)model->hparams.n_layer &&
             model->params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+            model->params.tensor_buft_overrides == nullptr &&
             params.offload_kqv;
 
     // pipeline parallelism requires support for async compute and events in all devices

From 2e4e8b13444824f0a7b10bdfc1cab8a859b4daf8 Mon Sep 17 00:00:00 2001
From: slaren
Date: Wed, 2 Apr 2025 02:31:53 +0200
Subject: [PATCH 6/6] disable pipeline parallelism when there are tensor overrides

---
 src/llama-context.cpp | 3 ++-
 src/llama-model.cpp   | 7 +++++++
 src/llama-model.h     | 2 ++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 3479a8cca3d64..70efb22f7b5e3 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -255,7 +255,8 @@ llama_context::llama_context(
             model.n_devices() > 1 &&
             model.params.n_gpu_layers > (int) model.hparams.n_layer &&
             model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
-            cparams.offload_kqv;
+            cparams.offload_kqv &&
+            !model.has_tensor_overrides();
 
     // pipeline parallelism requires support for async compute and events in all devices
     if (pipeline_parallel) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index dd481faf7f03f..ca6e3ab2caeb1 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -379,9 +379,12 @@ struct llama_model::impl {
     layer_dev dev_input = {};
     layer_dev dev_output = {};
     std::vector<layer_dev> dev_layer;
+
+    bool has_tensor_overrides;
 };
 
 llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
 llama_model::~llama_model() {}
@@ -4169,6 +4172,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
             });
 }
 
+bool llama_model::has_tensor_overrides() const {
+    return pimpl->has_tensor_overrides;
+}
+
 const ggml_tensor * llama_model::get_tensor(const char * name) const {
     auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
             [name](const std::pair<std::string, struct ggml_tensor *> & it) {
diff --git a/src/llama-model.h b/src/llama-model.h
index f1bf0df3a4ef6..91e6e8725acd2 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -382,6 +382,8 @@ struct llama_model {
 
     ggml_backend_buffer_type_t select_buft(int il) const;
 
+    bool has_tensor_overrides() const;
+
     const struct ggml_tensor * get_tensor(const char * name) const;
 
     // TODO: move this to new llm_arch_model_i interface
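
Usage sketch (illustrative, not taken from the patches above): a program using the new llama_model_params::tensor_buft_overrides field could pin matching tensors to host memory as below. The file name, function name, and the choice of ggml_backend_cpu_buffer_type() / llama_model_load_from_file() are assumptions for the example; the pattern is matched against each tensor name (substring match in PATCH 1/6, std::regex_search after PATCH 2/6) and the list must be terminated by an entry whose pattern is nullptr.

// minimal sketch, assuming llama.h from this branch and the CPU buffer type
// exposed by ggml-backend.h
#include "llama.h"
#include "ggml-backend.h"

static llama_model * load_with_overrides(const char * path) {
    // keep any tensor whose name contains/matches "ffn" in the CPU buffer type,
    // let everything else follow the normal buffer type selection
    llama_model_tensor_buft_override overrides[] = {
        { "ffn",   ggml_backend_cpu_buffer_type() },
        { nullptr, nullptr }, // terminator: loader stops at pattern == nullptr
    };

    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides; // only read while the model loads

    return llama_model_load_from_file(path, mparams);
}

The command-line equivalent added in PATCH 1/6 would be -ot / --override-tensor with a "pattern=buffer type" pair, where the buffer type name must be one of the names reported by ggml_backend_buft_name() (for example CPU, or a GPU buffer type, depending on the build).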