Introduce ggml_threadpool #7526

Closed
wants to merge 17 commits into from
345 changes: 321 additions & 24 deletions common/common.cpp

Large diffs are not rendered by default.
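Since the common.cpp hunk above is collapsed, the new CLI plumbing is easiest to infer from the declarations it implements in common.h below, in particular `parse_cpu_mask`. The following is a minimal, hypothetical sketch of a hex-mask parser with that signature. It illustrates the intended behaviour (a hex affinity mask such as `0xA` mapped onto per-core booleans) rather than the actual code from this PR, and it assumes `GGML_N_CORES_MAX` is provided by the ggml headers.

```cpp
#include <string>

// Sketch only: the real implementation lives in the unrendered common.cpp diff.
// Maps a hex affinity string such as "0xA" onto the boolean core mask:
// bit 0 -> core 0, bit 1 -> core 1, and so on.
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) {
    // skip an optional "0x"/"0X" prefix
    const size_t start = (mask.rfind("0x", 0) == 0 || mask.rfind("0X", 0) == 0) ? 2 : 0;
    size_t core = 0;

    // walk the hex digits from least significant to most significant
    for (size_t i = mask.size(); i > start && core < GGML_N_CORES_MAX; --i) {
        const char c = mask[i - 1];
        int nibble = 0;
        if      (c >= '0' && c <= '9') { nibble = c - '0'; }
        else if (c >= 'a' && c <= 'f') { nibble = c - 'a' + 10; }
        else if (c >= 'A' && c <= 'F') { nibble = c - 'A' + 10; }
        else { return false; } // reject non-hex characters
        for (int b = 0; b < 4 && core < GGML_N_CORES_MAX; ++b, ++core) {
            boolmask[core] = boolmask[core] || (((nibble >> b) & 1) != 0);
        }
    }
    return true;
}
```

llama-bench below wires this into the new `-C` / `--cpu-mask` option and records the result in `cpu_params::cpumask` together with `mask_valid`.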

27 changes: 21 additions & 6 deletions common/common.h
@@ -52,13 +52,18 @@ int32_t cpu_get_num_math();
// CLI argument parsing
//

struct cpu_params {
int32_t n_threads = -1;
bool cpumask[GGML_N_CORES_MAX] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
bool poll = false; // Use polling (busywait) to wait for work
};

struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -91,6 +96,11 @@ struct gpt_params {
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct cpu_params draft_cpuparams;
struct cpu_params draft_cpuparams_batch;

ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -189,6 +199,10 @@ bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_N_CORES_MAX]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_N_CORES_MAX]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);

std::string gpt_params_get_system_info(const gpt_params & params);

//
@@ -220,8 +234,9 @@ std::string fs_get_cache_directory();
// TODO: avoid tuplue, use struct
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);

struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
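Taken together, the declarations above suggest the following flow from CLI-level `cpu_params` to a ggml threadpool. This is a hedged usage sketch rather than code from the PR: the wrapper function name, the literal values, and the comments are illustrative assumptions, while the types and calls (`cpu_params`, `parse_cpu_mask`, `postprocess_cpu_params`, `ggml_threadpool_params_from_cpu_params`, `ggml_create_threadpool`, `ggml_release_threadpool`) are the ones introduced or used elsewhere in this diff.

```cpp
#include "common.h"
#include "ggml.h"
#include <string>

// Illustrative helper (not part of the PR): build a threadpool from CLI-style settings.
static struct ggml_compute_threadpool * threadpool_from_cli(const std::string & hex_mask) {
    cpu_params cp;
    cp.n_threads  = 8;                                     // e.g. from -t / --threads
    cp.mask_valid = parse_cpu_mask(hex_mask, cp.cpumask);  // e.g. from -C / --cpu-mask
    cp.strict_cpu = true;                                  // request strict placement on the mask
    cp.priority   = 1;                                     // 0 normal .. 3 realtime, per the header comment
    cp.poll       = false;                                 // sleep-wait rather than busy-wait

    // assumption: resolves defaults such as n_threads == -1; the optional second
    // argument appears to be a template ("role model") cpu_params to copy from
    postprocess_cpu_params(cp);

    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(cp);
    return ggml_create_threadpool(&tpp);  // release later with ggml_release_threadpool()
}
```

main.cpp further down then attaches such a pool to a context via `llama_attach_threadpool`, with a second pool for batch processing attached via `llama_attach_batch_threadpool`.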
2 changes: 1 addition & 1 deletion examples/baby-llama/baby-llama.cpp
@@ -19,7 +19,7 @@ constexpr float rms_norm_eps = 5e-6f;
#endif

static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
5 changes: 3 additions & 2 deletions examples/batched-bench/batched-bench.cpp
@@ -119,8 +119,9 @@ int main(int argc, char ** argv) {
ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;

ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
ctx_params.n_threads = params.cpuparams.n_threads;
ctx_params.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;

// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
5 changes: 3 additions & 2 deletions examples/batched/batched.cpp
@@ -83,8 +83,9 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_seq_max = n_parallel;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
ctx_params.n_threads = params.cpuparams.n_threads;
ctx_params.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;

llama_context * ctx = llama_new_context_with_model(model, ctx_params);

2 changes: 1 addition & 1 deletion examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
#endif

static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
2 changes: 1 addition & 1 deletion examples/export-lora/export-lora.cpp
@@ -344,7 +344,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int

ggml_gallocr_alloc_graph(alloc, gf);

struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, nullptr);
static std::vector<uint8_t> data_work;
data_work.resize(cplan.work_size);
cplan.work_data = data_work.data();
2 changes: 1 addition & 1 deletion examples/finetune/finetune.cpp
@@ -1818,7 +1818,7 @@ int main(int argc, char ** argv) {
opt_cb_data.millis_per_iter = 0.0;

// measure required memory for work buffer
size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads, nullptr).work_size + GGML_OBJECT_SIZE;
printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));

// context for work buffer
96 changes: 78 additions & 18 deletions examples/llama-bench/llama-bench.cpp
@@ -187,10 +187,20 @@ struct cmd_params {
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
cpu_params cpuparams;
int reps;
bool verbose;
output_formats output_format;
};
//
Review comment (Contributor Author): Debug leftovers

//static const cpu_params default_cpuparams(
// int32_t(std::thread::hardware_concurrency()),
// {false},
// false,
// 1,
// false,
// false
//);

static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
@@ -212,6 +222,7 @@ static const cmd_params cmd_params_defaults = {
/* use_mmap */ {true},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* cpuparams */ {},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
@@ -239,6 +250,11 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -mt, --max-threads <n> (default: %d)\n", cmd_params_defaults.cpuparams.n_threads);
printf(" -C, --cpu-mask <hex> (default: 0x0)\n");
printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority);
printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll);
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -275,7 +291,7 @@ static ggml_type ggml_type_from_name(const std::string & s) {
}


static cmd_params parse_cmd_params(int argc, char ** argv) {
static cmd_params parse_cmd_params(int argc, char** argv) {
cmd_params params;
std::string arg;
bool invalid_param = false;
@@ -326,7 +342,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
@@ -348,7 +364,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
for (const auto& t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true;
@@ -364,7 +380,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
Review comment (Contributor Author): That's what I get when I use an unconfigured IDE

for (const auto& t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true;
@@ -400,7 +416,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<std::string>(argv[i], split_delim);
std::vector<llama_split_mode> modes;
for (const auto & m : p) {
for (const auto& m : p) {
llama_split_mode mode;
if (m == "none") {
mode = LLAMA_SPLIT_MODE_NONE;
@@ -434,11 +450,36 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
} else {
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}

} else if (arg == "-mt" || arg == "--max-threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.n_threads = std::stoi(argv[i]);
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
} else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.priority = std::stoul(argv[i]);
} else if (arg == "--cpu-strict") {
params.cpuparams.strict_cpu = true;
} else if (arg == "--poll") {
params.cpuparams.poll = true;
} else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) {
invalid_param = true;
@@ -1234,8 +1275,7 @@ struct sql_printer : public printer {
}
};

static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
llama_set_n_threads(ctx, n_threads, n_threads);
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch) {

const llama_model * model = llama_get_model(ctx);
const int32_t n_vocab = llama_n_vocab(model);
@@ -1257,9 +1297,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
llama_synchronize(ctx);
}

static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
Review comment (Contributor Author): This should not have changed

llama_set_n_threads(ctx, n_threads, n_threads);

static void test_gen(llama_context * ctx, int n_gen, int n_past) {
const llama_model * model = llama_get_model(ctx);
const int32_t n_vocab = llama_n_vocab(model);

@@ -1330,6 +1368,23 @@ int main(int argc, char ** argv) {
llama_model * lmodel = nullptr;
const cmd_params_instance * prev_inst = nullptr;

postprocess_cpu_params(params.cpuparams);

struct ggml_threadpool_params tpp;
tpp.n_threads = params.cpuparams.n_threads;
tpp.mask_specified = params.cpuparams.mask_valid;
tpp.strict_cpu = params.cpuparams.strict_cpu;
tpp.prio = params.cpuparams.priority;
tpp.poll = params.cpuparams.poll;

std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_N_CORES_MAX);

struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}

for (const auto & inst : params_instances) {
// keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1356,13 +1411,16 @@

llama_kv_cache_clear(ctx);

llama_set_n_threads(ctx, t.n_threads, t.n_threads);
llama_attach_threadpool(ctx, threadpool);

// warmup run
if (t.n_prompt > 0) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch);
test_prompt(ctx, t.n_prompt, 0, t.n_batch);
}
if (t.n_gen > 0) {
test_gen(ctx, 1, 0, t.n_threads);
test_gen(ctx, 1, 0);
}

for (int i = 0; i < params.reps; i++) {
@@ -1371,10 +1429,10 @@
uint64_t t_start = get_time_ns();

if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch);
}
if (t.n_gen > 0) {
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
test_gen(ctx, t.n_gen, t.n_prompt);
}

uint64_t t_ns = get_time_ns() - t_start;
@@ -1386,7 +1444,9 @@
llama_print_timings(ctx);

llama_free(ctx);

}
ggml_release_threadpool(threadpool);

llama_free_model(lmodel);

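One design note on the llama-bench hunk above: it fills `ggml_threadpool_params` field by field and memcpy's the CPU mask, whereas main.cpp further down goes through the new `ggml_threadpool_params_from_cpu_params` helper. Assuming the helper copies the same fields, the manual block could shrink to roughly the following sketch:

```cpp
// Hypothetical equivalent of the manual setup in llama-bench, using the common.h helper.
postprocess_cpu_params(params.cpuparams);

struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);

struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
    LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
    exit(1);
}
// ... run the benchmark instances, then:
ggml_release_threadpool(threadpool);
```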
4 changes: 2 additions & 2 deletions examples/llava/llava-cli.cpp
@@ -126,14 +126,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
30 changes: 30 additions & 0 deletions examples/main/main.cpp
@@ -202,11 +202,38 @@ int main(int argc, char ** argv) {
ctx_guidance = llama_new_context_with_model(model, lparams);
}

LOG("%s: llama threadpool init = n_threads = %d\n",
__func__,
(int32_t) params.cpuparams.n_threads
);
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams);

struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch);
if (!threadpool_batch) {
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
exit(1);
}
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}

if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
return 1;
}

llama_attach_batch_threadpool(ctx, threadpool_batch);
llama_attach_threadpool(ctx, threadpool);
if (ctx_guidance) {
llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
llama_attach_threadpool(ctx_guidance, threadpool);
}

const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
@@ -963,6 +990,9 @@ int main(int argc, char ** argv) {
llama_sampling_free(ctx_sampling);
llama_backend_free();

ggml_release_threadpool(threadpool);
ggml_release_threadpool(threadpool_batch);

#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS