From 9a4bdc8c12feda3de96ecee7e8a31c593894eebd Mon Sep 17 00:00:00 2001 From: fmz Date: Fri, 24 May 2024 12:04:00 -0700 Subject: [PATCH 01/15] Introduce ggml_threadpool Added an API to support explicit management of threadpools. --- CMakePresets.json | 17 +- common/common.cpp | 343 +++++++- common/common.h | 23 +- examples/baby-llama/baby-llama.cpp | 2 +- examples/batched-bench/batched-bench.cpp | 5 +- examples/batched/batched.cpp | 5 +- examples/benchmark/benchmark-matmult.cpp | 2 +- examples/export-lora/export-lora.cpp | 2 +- examples/finetune/finetune.cpp | 2 +- examples/llava/clip.cpp | 2 +- examples/llava/llava-cli.cpp | 4 +- examples/main/main.cpp | 29 + examples/passkey/passkey.cpp | 5 +- examples/server/server.cpp | 10 +- examples/simple/simple.cpp | 5 +- examples/speculative/speculative.cpp | 39 +- .../train-text-from-scratch.cpp | 2 +- ggml-alloc.h | 5 +- ggml-backend-impl.h | 5 +- ggml-backend.c | 74 +- ggml-backend.h | 28 +- ggml-cuda.cu | 8 +- ggml-kompute.cpp | 7 +- ggml-metal.m | 7 +- ggml-opencl.cpp | 7 +- ggml-sycl.cpp | 8 +- ggml-vulkan.cpp | 7 +- ggml.c | 769 ++++++++++++++---- ggml.h | 25 +- llama.cpp | 82 +- llama.h | 10 + tests/test-backend-ops.cpp | 4 +- tests/test-rope.cpp | 2 +- 33 files changed, 1278 insertions(+), 267 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index ad1af7eccebbd..44e11a5f9cf78 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -11,6 +11,17 @@ "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." } }, + { + "name": "msvc", + "hidden": true, + "generator": "Visual Studio 17 2022", + "architecture": "ARM", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + } + }, { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, @@ -38,8 +49,8 @@ { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] }, { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] }, - { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, - { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] }, - { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] } + { "name": "arm64-windows-msvc-debug" , "inherits": [ "msvc", "arm64-windows-msvc", "debug" ] }, + { "name": "arm64-windows-msvc-release", "inherits": [ "msvc", "arm64-windows-msvc", "release" ] }, + { "name": "arm64-windows-msvc+static-release", "inherits": [ "msvc", "arm64-windows-msvc", "release", "static" ] } ] } diff --git a/common/common.cpp b/common/common.cpp index 7500e08ff1be4..6306fb572d1c7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -218,6 +218,34 @@ void gpt_params_handle_model_default(gpt_params & params) { } } +static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr) { + int32_t n_set = 0; + + if (cpuparams.n_threads < 0) { + // Assuming everything about cpuparams is invalid + if (role_model != nullptr) { + cpuparams = *role_model; + } else { + cpuparams.n_threads = cpu_get_num_math(); + } + } + + for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) { + if (cpuparams.cpumask[i]) { + n_set++; + } + } + if (n_set == 0) { + // You hit the jackpot! 
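+        // An all-zero mask means no affinity was requested; default to allowing every core.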
+ memset(&cpuparams.cpumask[0], 1, GGML_N_CORES_MAX); + n_set = GGML_N_CORES_MAX; + } + if (n_set < cpuparams.n_threads) { + // Not enough set bits, may experience performance issues. + fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + } +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -237,6 +265,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { @@ -280,6 +313,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } +static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) { + size_t dash_loc = range.find('-'); + if (dash_loc == std::string::npos) { + fprintf(stderr, "Format of CPU range is invalid! Expected []-[].\n"); + return false; + } + + size_t start_i; + size_t end_i; + + if (dash_loc == 0) { + start_i = 0; + } else { + start_i = std::stoull(range.substr(0, dash_loc)); + if (start_i >= GGML_N_CORES_MAX) { + fprintf(stderr, "Start index out of bounds!\n"); + return false; + } + } + + if (dash_loc == range.length() - 1) { + end_i = GGML_N_CORES_MAX - 1; + } else { + end_i = std::stoull(range.substr(dash_loc + 1)); + if (end_i >= GGML_N_CORES_MAX) { + fprintf(stderr, "End index out of bounds!\n"); + return false; + } + } + + for (size_t i = start_i; i <= end_i; i++) { + boolmask[i] = true; + } + + return true; +} + +static bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) { + // Discard potential 0x prefix + size_t start_i = 0; + if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { + start_i = 2; + } + + size_t num_digits = mask.length() - start_i; + if (num_digits > 128) num_digits = 128; + + size_t end_i = num_digits + start_i; + + for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) { + char c = mask.at(i); + int8_t id = c; + + if ((c >= '0' && c <= '9')) { + id -= '0'; + } else if (c >= 'a' && c <= 'f') { + id -= 'a' - 10; + } else if (c >= 'A' && c <= 'F') { + id -= 'A' - 10; + } else { + fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + return false; + } + + boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0); + boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0); + boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0); + boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0); + } + + return true; +} + bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { llama_sampling_params & sparams = params.sparams; @@ -298,43 +404,187 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); + params.cpuparams.n_threads = std::stoi(argv[i]); + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + 
invalid_param = true; + return true; + } + std::string mask = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); + return true; + } + if (arg == "-Cr" || arg == "--cpu-range") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string range = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); + return true; + } + if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.cpuparams.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict") { + params.cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll") { + params.cpuparams.poll = true; + return true; + } if (arg == "-tb" || arg == "--threads-batch") { if (++i >= argc) { invalid_param = true; return true; } - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); + params.cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-Cb" || arg == "--cpu-mask-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string mask = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "-Crb" || arg == "--cpu-range_batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string range = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.cpuparams_batch.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-batch") { + params.cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-batch") { + params.cpuparams_batch.poll = true; + return true; + } if (arg == "-td" || arg == "--threads-draft") { if (++i >= argc) { invalid_param = true; return true; } - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); + params.draft_cpuparams.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; } - if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (arg == "-Cd" || arg == "--cpu-mask-draft") { if (++i >= argc) { invalid_param = true; return true; } - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - params.n_threads_batch_draft = std::thread::hardware_concurrency(); + std::string mask = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "-Crd" || arg == "--cpu-range-draft") { + if (++i >= argc) { + invalid_param = true; + return true; } + std::string range = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "--prio-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.draft_cpuparams.priority = std::stoul(argv[i]); + return true; + 
} + if (arg == "--cpu-strict-draft") { + params.draft_cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll-draft") { + params.draft_cpuparams.poll = true; + return true; + } + if (arg == "-tdb" || arg == "--threads-draft-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-Cdb" || arg == "--cpu-mask-draft-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string mask = argv[i]; + params.draft_cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask); + return true; + } + if (arg == "-Crdb" || arg == "--cpu-range-draft-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string range = argv[i]; + params.draft_cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-draft-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.draft_cpuparams_batch.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-draft-batch") { + params.draft_cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-draft_batch") { + params.draft_cpuparams_batch.poll = true; return true; } if (arg == "-p" || arg == "--prompt") { @@ -1373,13 +1623,41 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param printf(" (can be specified more than once for multiple prompts).\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); + printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.cpuparams.n_threads); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -C M, --cpu-mask M CPU affinity mask: arbitrarily long hex. Takes precedence over cpu-range (default: \"\")\n"); + printf(" -Cr --cpu-range lo-hi Ranges of CPUs for affinity (alternative to --cpu-mask)\n"); + printf(" -Cb M, --cpu-mask-batch M\n"); + printf(" CPU affinity mask: arbitrarily long hex. 
Takes precedence over cpu-range (default: \"\")\n"); + printf(" -Crb --cpu-range-batch lo-hi\n"); + printf(" Ranges of CPUs for affinity (alternative to --cpu-mask)\n"); + printf(" --cpu-strict Use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu); + printf(" --cpu-strict-batch Use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu); + printf(" --priority N Set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority); + printf(" --priority-batch N Set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority); + printf(" --poll Use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll); + printf(" --poll-batch Use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll); printf(" -td N, --threads-draft N"); - printf(" number of threads to use during generation (default: same as --threads)\n"); - printf(" -tbd N, --threads-batch-draft N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); + printf(" number of threads to use during generation for draft model (default: same as --threads)\n"); + printf(" -tdb N, --threads-draft-batch N\n"); + printf(" number of threads to use during batch and prompt processing for draft model (default: same as --threads-draft)\n"); + printf(" -Cd M, --cpu-mask-draft M\n"); + printf(" Draft model CPU affinity mask. Takes precedence over cpu-range-draft (default: \"\")\n"); + printf(" -Crd --cpu-range-draft lo-hi\n"); + printf(" Ranges of CPUs for affinity (alternative to --cpu-mask-draft)\n"); + printf(" -Cdb M, --cpu-mask-draft-batch M\n"); + printf(" Draft model CPU affinity mask. 
Takes precedence over cpu-range-draft (default: \"\")\n"); + printf(" -Crdb --cpu-range-draft-batch lo-hi\n"); + printf(" Ranges of CPUs for affinity (alternative to --cpu-mask-draft)\n"); + printf(" --cpu-strict-draft Use strict CPU placement for draft model (default: %u)\n", (unsigned) params.draft_cpuparams.strict_cpu); + printf(" --cpu-strict-draft-batch\n"); + printf(" Use strict CPU placement for draft model (default: %u)\n", (unsigned) params.draft_cpuparams.strict_cpu); + printf(" --priority-draft N Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority); + printf(" --priority-draft-batch N\n"); + printf(" Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority); + printf(" --poll-draft Use polling to wait for draft model work (default: %u)\n", (unsigned) params.draft_cpuparams.poll); + printf(" --poll-draft-batch Use polling to wait for draft model work (default: %u)\n", (unsigned) params.draft_cpuparams.poll); printf(" -p PROMPT, --prompt PROMPT\n"); printf(" prompt to start generation with (default: empty)\n"); printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); @@ -1551,9 +1829,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; - os << "system_info: n_threads = " << params.n_threads; - if (params.n_threads_batch != -1) { - os << " (n_threads_batch = " << params.n_threads_batch << ")"; + os << "system_info: n_threads = " << params.cpuparams.n_threads; + if (params.cpuparams_batch.n_threads != -1) { + os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")"; } os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); @@ -1943,7 +2221,7 @@ std::tuple llama_init_from_gpt_par ((i > 0) || params.lora_base.empty()) ? NULL : params.lora_base.c_str(), - params.n_threads); + params.cpuparams.n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); @@ -2028,8 +2306,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_seq_max = params.n_parallel; cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.n_threads = params.cpuparams.n_threads; + cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? 
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads; cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; @@ -2054,6 +2333,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param return cparams; } +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) { + struct ggml_threadpool_params tpp; + + tpp.mask_specified = params.mask_valid; + if (params.mask_valid) { + std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_N_CORES_MAX); + } + + tpp.n_threads = params.n_threads; + tpp.prio = params.priority; + tpp.poll = params.poll; + tpp.strict_cpu = params.strict_cpu; + + return tpp; +} + #ifdef LLAMA_USE_CURL static bool starts_with(const std::string & str, const std::string & prefix) { @@ -2971,7 +3266,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); diff --git a/common/common.h b/common/common.h index f68f3c2979b94..9c5ab959fb6fe 100644 --- a/common/common.h +++ b/common/common.h @@ -52,13 +52,18 @@ int32_t cpu_get_num_math(); // CLI argument parsing // +struct cpu_params { + int32_t n_threads = -1; + bool cpumask[GGML_N_CORES_MAX] = {false}; // CPU affinity mask. + bool mask_valid = false; // Default: any CPU + int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + bool strict_cpu = false; // Use strict CPU placement + bool poll = false; // Use polling (busywait) to wait for work +}; + struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_threads = cpu_get_num_math(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -91,6 +96,11 @@ struct gpt_params { ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; + struct cpu_params cpuparams; + struct cpu_params cpuparams_batch; + struct cpu_params draft_cpuparams; + struct cpu_params draft_cpuparams_batch; + ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; @@ -219,8 +229,9 @@ std::string fs_get_cache_directory(); // TODO: avoid tuplue, use struct std::tuple llama_init_from_gpt_params(gpt_params & params); -struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); -struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); +struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params); +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); struct 
llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params); diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index bf0125e753746..5b233e352a95a 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -19,7 +19,7 @@ constexpr float rms_norm_eps = 5e-6f; #endif static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 2924d8116f44f..bd4a4c4ade40a 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -119,8 +119,9 @@ int main(int argc, char ** argv) { ctx_params.n_ubatch = n_ubatch; ctx_params.flash_attn = flash_attn; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_threads = params.cpuparams.n_threads; + ctx_params.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? + params.cpuparams.n_threads : params.cpuparams_batch.n_threads; // ensure enough sequences are available ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 591bc6e57645c..55f11c522c764 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -83,8 +83,9 @@ int main(int argc, char ** argv) { ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_len, n_parallel); ctx_params.n_seq_max = n_parallel; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_threads = params.cpuparams.n_threads; + ctx_params.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? 
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads; llama_context * ctx = llama_new_context_with_model(model, ctx_params); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 47cb16c69d536..e78f6b388ef6e 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -21,7 +21,7 @@ #endif static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 08413f57e4c3a..0cf05a085472d 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -344,7 +344,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int ggml_gallocr_alloc_graph(alloc, gf); - struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, nullptr); static std::vector data_work; data_work.resize(cplan.work_size); cplan.work_data = data_work.data(); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 22425730f20eb..047868a8bbe68 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1818,7 +1818,7 @@ int main(int argc, char ** argv) { opt_cb_data.millis_per_iter = 0.0; // measure required memory for work buffer - size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; + size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads, nullptr).work_size + GGML_OBJECT_SIZE; printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); // context for work buffer diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 95fbe3d0216c4..87f2d8b19793c 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1915,7 +1915,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } #endif - ggml_backend_graph_compute(ctx->backend, gf); + ggml_backend_graph_compute(ctx->backend, gf, NULL); // the last node is the embedding tensor struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index c974900f21e20..4f90667baaefc 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -126,14 +126,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para if (!params->image.empty()) { LOG_TEE("using base64 encoded image instead of command line image path\n"); } - embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); + embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); if (!embed) { LOG_TEE("%s: can't load image from prompt\n", __func__); return NULL; } params->prompt = remove_image_from_prompt(prompt); } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str()); + embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embed) { fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); return NULL; diff --git a/examples/main/main.cpp 
b/examples/main/main.cpp index 09fa85fce0ee3..747c399209955 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -202,11 +202,38 @@ int main(int argc, char ** argv) { ctx_guidance = llama_new_context_with_model(model, lparams); } + LOG("%s: llama threadpool init = n_threads = %d\n", + __func__, + (int32_t) params.cpuparams.n_threads + ); + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch); + if (!threadpool_batch) { + LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + exit(1); + } + struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + if (model == NULL) { LOG_TEE("%s: error: unable to load model\n", __func__); return 1; } + llama_attach_batch_threadpool(ctx, threadpool_batch); + llama_attach_threadpool(ctx, threadpool); + if (ctx_guidance) { + llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); + llama_attach_threadpool(ctx_guidance, threadpool); + } + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); @@ -955,6 +982,8 @@ int main(int argc, char ** argv) { llama_sampling_free(ctx_sampling); llama_backend_free(); + ggml_release_threadpool(threadpool); + #ifndef LOG_DISABLE_LOGS LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index f2ef9ca10d4a2..04fe7b56be536 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -94,8 +94,9 @@ int main(int argc, char ** argv) { ctx_params.seed = seed; ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; ctx_params.n_batch = 512; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_threads = params.cpuparams.n_threads; + ctx_params.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? + params.cpuparams.n_threads : params.cpuparams_batch.n_threads; GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e9904263d53c7..7bd72661fff7e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2329,7 +2329,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf("options:\n"); printf(" -h, --help show this help message and exit\n"); printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads); printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n"); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); @@ -2612,7 +2612,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - params.n_threads = std::stoi(argv[i]); + params.cpuparams.n_threads = std::stoi(argv[i]); } else if (arg == "--grp-attn-n" || arg == "-gan") { if (++i >= argc) { invalid_param = true; @@ -2632,7 +2632,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - params.n_threads_batch = std::stoi(argv[i]); + params.cpuparams_batch.n_threads = std::stoi(argv[i]); } else if (arg == "--threads-http") { if (++i >= argc) { invalid_param = true; @@ -2943,8 +2943,8 @@ int main(int argc, char ** argv) { }); LOG_INFO("system info", { - {"n_threads", params.n_threads}, - {"n_threads_batch", params.n_threads_batch}, + {"n_threads", params.cpuparams.n_threads}, + {"n_threads_batch", params.cpuparams_batch.n_threads}, {"total_threads", std::thread::hardware_concurrency()}, {"system_info", llama_print_system_info()}, }); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index b0f8e0fdc4987..a50a4dc3b52dc 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -53,8 +53,9 @@ int main(int argc, char ** argv) { ctx_params.seed = 1234; ctx_params.n_ctx = 2048; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_threads = params.cpuparams.n_threads; + ctx_params.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? 
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads; llama_context * ctx = llama_new_context_with_model(model, ctx_params); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 12e46fbc91a24..d99b67200714b 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -24,6 +24,14 @@ struct seq_draft { struct llama_sampling_context * ctx_sampling; }; +static void switch_active_threadpool( + ggml_compute_threadpool_t cur, + ggml_compute_threadpool_t nxt +) { + ggml_pause_threadpool(cur); + ggml_resume_threadpool(nxt); +} + int main(int argc, char ** argv) { gpt_params params; @@ -67,13 +75,19 @@ int main(int argc, char ** argv) { // load the target model std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); + ggml_threadpool_params tpp_tgt = ggml_threadpool_params_from_cpu_params(params.cpuparams); + ggml_compute_threadpool * threadpool_tgt = ggml_create_threadpool(&tpp_tgt); + if (!threadpool_tgt) { + LOG_TEE("%s: target threadpool create failed : n_threads %d\n", __func__, tpp_tgt.n_threads); + exit(1); + } + // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.n_threads_draft > 0) { - params.n_threads = params.n_threads_draft; + if (params.draft_cpuparams.n_threads > 0) { + params.cpuparams = params.draft_cpuparams; } - params.n_threads_batch = params.n_threads_batch_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); const bool vocab_type_tgt = llama_vocab_type(model_tgt); @@ -98,6 +112,17 @@ int main(int argc, char ** argv) { return 1; } + ggml_threadpool_params tpp_dft = ggml_threadpool_params_from_cpu_params(params.draft_cpuparams); + ggml_compute_threadpool * threadpool_dft = ggml_create_threadpool(&tpp_dft); + if (!threadpool_dft) { + LOG_TEE("%s: draft threadpool create failed : n_threads %d\n", __func__, tpp_dft.n_threads); + exit(1); + } + + llama_attach_threadpool(ctx_tgt, threadpool_tgt); + llama_attach_threadpool(ctx_dft, threadpool_dft); + ggml_pause_threadpool(threadpool_dft); + { const int n_vocab_tgt = llama_n_vocab(model_tgt); const int n_vocab_dft = llama_n_vocab(model_dft); @@ -153,6 +178,7 @@ int main(int argc, char ** argv) { // eval the prompt with both models llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); + switch_active_threadpool(threadpool_tgt, threadpool_dft); llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); const auto t_enc_end = ggml_time_us(); @@ -542,6 +568,7 @@ int main(int argc, char ** argv) { // evaluate the drafted tokens on the draft model llama_decode(ctx_dft, batch_dft); + ++n_past_cur; ++n_drafted; @@ -549,6 +576,7 @@ int main(int argc, char ** argv) { break; } } + switch_active_threadpool(threadpool_dft, threadpool_tgt); // evaluate the target model on the drafted tokens { @@ -559,6 +587,8 @@ int main(int argc, char ** argv) { // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); llama_decode(ctx_tgt, batch_tgt); + switch_active_threadpool(threadpool_tgt, threadpool_dft); + ++n_past_tgt; } @@ -608,6 +638,9 @@ int main(int argc, char ** argv) { llama_backend_free(); + ggml_release_threadpool(threadpool_tgt); + ggml_release_threadpool(threadpool_dft); + fprintf(stderr, "\n\n"); return 0; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp 
b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e2f85c68297b8..7994ccd7ef038 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1211,7 +1211,7 @@ int main(int argc, char ** argv) { opt_cb_data.millis_per_iter = 0.0; // measure required memory for work buffer - size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; + size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads, nullptr).work_size + GGML_OBJECT_SIZE; printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); // context for work buffer diff --git a/ggml-alloc.h b/ggml-alloc.h index 434c13b34a929..cd85b6ee70560 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -7,8 +7,9 @@ extern "C" { #endif typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; -typedef struct ggml_backend_buffer * ggml_backend_buffer_t; -typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t; // Tensor allocator struct ggml_tallocr { diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f121e1de420fa..b5d4d0f8d16ed 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -92,13 +92,14 @@ extern "C" { void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan (not used currently) - ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); + ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph with a plan enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // compute graph without a plan (async) - enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool); // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); diff --git a/ggml-backend.c b/ggml-backend.c index 9e35ce98d7ace..ae185e7f96307 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -254,10 +254,14 @@ void ggml_backend_synchronize(ggml_backend_t backend) { backend->iface.synchronize(backend); } -ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +ggml_backend_graph_plan_t ggml_backend_graph_plan_create( + ggml_backend_t backend, + const struct ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool +) { GGML_ASSERT(backend->iface.graph_plan_create != NULL); - return backend->iface.graph_plan_create(backend, cgraph); + return backend->iface.graph_plan_create(backend, cgraph, threadpool); } void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { @@ -266,20 +270,31 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla backend->iface.graph_plan_free(backend, plan); } -enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { 
+enum ggml_status ggml_backend_graph_plan_compute( + ggml_backend_t backend, + ggml_backend_graph_plan_t plan +) { GGML_ASSERT(backend->iface.graph_plan_compute != NULL); return backend->iface.graph_plan_compute(backend, plan); } -enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); +enum ggml_status ggml_backend_graph_compute( + ggml_backend_t backend, + struct ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool +) { + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, threadpool); ggml_backend_synchronize(backend); return err; } -enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - return backend->iface.graph_compute(backend, cgraph); +enum ggml_status ggml_backend_graph_compute_async( + ggml_backend_t backend, + struct ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool +) { + return backend->iface.graph_compute(backend, cgraph, threadpool); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -758,12 +773,16 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { +GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create( + ggml_backend_t backend, + const struct ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool +) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { @@ -797,10 +816,14 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute( + ggml_backend_t backend, + struct ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool +) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); @@ -1630,7 +1653,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } -static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { +static enum ggml_status ggml_backend_sched_compute_splits( + ggml_backend_sched_t sched, + ggml_compute_threadpool_t threadpool +) { struct ggml_backend_sched_split * splits = sched->splits; for (int i = 0; i < sched->n_splits; i++) { @@ -1664,7 +1690,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, threadpool); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1686,7 +1712,7 @@ 
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, threadpool); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1825,13 +1851,21 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra return true; } -enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph); +enum ggml_status ggml_backend_sched_graph_compute( + ggml_backend_sched_t sched, + struct ggml_cgraph * graph, + ggml_compute_threadpool_t threadpool +) { + enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph, threadpool); ggml_backend_sched_synchronize(sched); return err; } -enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +enum ggml_status ggml_backend_sched_graph_compute_async( + ggml_backend_sched_t sched, + struct ggml_cgraph * graph, + ggml_compute_threadpool_t threadpool +) { if (!sched->is_reset && !sched->is_alloc) { ggml_backend_sched_reset(sched); } @@ -1842,7 +1876,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch } } - return ggml_backend_sched_compute_splits(sched); + return ggml_backend_sched_compute_splits(sched, threadpool); } void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { @@ -2081,8 +2115,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); - ggml_backend_graph_compute(backend1, &g1v); - ggml_backend_graph_compute(backend2, &g2v); + ggml_backend_graph_compute(backend1, &g1v, NULL); + ggml_backend_graph_compute(backend2, &g2v, NULL); if (ggml_is_view_op(t1->op)) { continue; diff --git a/ggml-backend.h b/ggml-backend.h index 744b6a77457d7..b17d9770b55c9 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -67,12 +67,24 @@ extern "C" { GGML_API void ggml_backend_synchronize(ggml_backend_t backend); - GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - - GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create( + ggml_backend_t backend, + const struct ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool); + + GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + + GGML_API enum ggml_status ggml_backend_graph_plan_compute ( + ggml_backend_t backend, + ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute( + ggml_backend_t backend, + struct ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool); + GGML_API enum ggml_status ggml_backend_graph_compute_async( + ggml_backend_t backend, + struct ggml_cgraph * cgraph, + 
ggml_compute_threadpool_t threadpool); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); @@ -193,8 +205,8 @@ extern "C" { // Allocate and compute graph on the backend scheduler GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); - GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); - GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool); + GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool); GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); // Reset all assignments and allocators - must be called before changing the node backends diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b82167cbf7227..d33f8a49b83bf 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2495,9 +2495,13 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } -GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; +GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute( + ggml_backend_t backend, + ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool) { + GGML_UNUSED(threadpool); + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_device(cuda_ctx->device); #ifdef USE_CUDA_GRAPH diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 6c6058b2a95b1..90272d5f13d7b 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1948,7 +1948,12 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g return ggml_backend_kompute_buffer_type(ctx->device); } -static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_status ggml_backend_kompute_graph_compute( + ggml_backend_t backend, + struct ggml_cgraph * cgraph + ggml_compute_threadpool_t threadpool) { + + GGML_UNUSED(threadpool); auto * ctx = static_cast(backend->context); ggml_vk_graph_compute(ctx, cgraph); return GGML_STATUS_SUCCESS; diff --git a/ggml-metal.m b/ggml-metal.m index c9e570dbf5a3a..051ade2fcc2c5 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -3103,7 +3103,12 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute( + ggml_backend_t backend, + struct ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool) { + + UNUSED(threadpool); struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index e28566a7bdbd7..079a718a115a2 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -2235,7 +2235,12 @@ static ggml_backend_buffer_type_t 
ggml_backend_opencl_get_default_buffer_type(gg GGML_UNUSED(backend); } -static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { +static ggml_status ggml_backend_opencl_graph_compute( + ggml_backend_t backend, + ggml_cgraph * graph, + ggml_compute_threadpool_t threadpool) { + + GGML_UNUSED(threadpool); for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node = graph->nodes[i]; diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 496ec61c3c28a..15d07dc7a3d1c 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17022,7 +17022,13 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static ggml_status ggml_backend_sycl_graph_compute( + ggml_backend_t backend, + ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool) { + + GGML_UNUSED(threadpool); + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 79ce1479f16ca..1f7923d65b800 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -6225,7 +6225,12 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; } -GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static ggml_status ggml_backend_vk_graph_compute( + ggml_backend_t backend, + ggml_cgraph * cgraph, + ggml_compute_threadpool_t threadpool) { + + GGML_UNUSED(threadpool); #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl; #endif diff --git a/ggml.c b/ggml.c index 9e72b7a765dba..3b4c093b30083 100644 --- a/ggml.c +++ b/ggml.c @@ -1739,30 +1739,103 @@ struct ggml_context_container { struct ggml_context context; }; -struct ggml_compute_state_shared { - const struct ggml_cgraph* cgraph; - const struct ggml_cplan* cplan; +// +// thread data +// - int64_t perf_node_start_cycles; - int64_t perf_node_start_time_us; +typedef pthread_t ggml_thread_t; + +#if defined(_WIN32) + +typedef CONDITION_VARIABLE ggml_cond_t; +typedef SRWLOCK ggml_mutex_t; + +#define ggml_mutex_init(m) InitializeSRWLock(m) +#define ggml_mutex_destroy(m) +#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m) +#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m) +#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m) +#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m) + +#define ggml_cond_init(c) InitializeConditionVariable(c) +#define ggml_cond_destroy(c) +#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED) +#define ggml_cond_broadcast(c) WakeAllConditionVariable(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +typedef pthread_cond_t ggml_cond_t; +typedef pthread_mutex_t ggml_mutex_t; + +#define ggml_mutex_init(m) pthread_mutex_init(m, NULL) +#define ggml_mutex_destroy(m) pthread_mutex_destroy(m) +#define ggml_mutex_lock(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock(m) pthread_mutex_unlock(m) +#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m) - const int n_threads; +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) 
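+// NOTE: the ggml_lock_* macros are busy-wait hints (a CPU pause on x86, no-ops elsewhere), not real locks.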
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) +#define GGML_LOCK_INITIALIZER 0 +#define ggml_cond_init(c) pthread_cond_init(c, NULL) +#define ggml_cond_destroy(c) pthread_cond_destroy(c) +#define ggml_cond_wait(c, m) pthread_cond_wait(c, m) +#define ggml_cond_broadcast(c) pthread_cond_broadcast(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + + +// Threadpool def +struct ggml_compute_threadpool { // synchronization primitives - atomic_int n_active; // num active threads - atomic_int node_n; // active graph node - atomic_int node_task; // active graph node task phase + atomic_int n_ready; // number of ready threads (inter-graph sync) + atomic_int n_active; // number of active threads (intra-graph sync) + atomic_int node_n; // active graph node + atomic_int node_task; // active graph node task phase + volatile bool stop; // Used for stopping the threadpool altogether + volatile bool pause; // Used for pausing the threadpool or individual threads + + struct ggml_cgraph * cgraph; + struct ggml_cplan * cplan; + + struct ggml_compute_state * workers; // per thread state + int32_t n_threads_max; // number of threads in the pool + int32_t n_threads_cur; // number of threads used in the current graph + + bool poll; // Use polling (busywait) // TODO + int32_t prio; // Scheduling priority + + ggml_mutex_t mutex; // mutex for cond.var + ggml_cond_t cond; // cond.var for waiting for new work + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; ggml_abort_callback abort_callback; // abort ggml_graph_compute when true - void* abort_callback_data; + void * abort_callback_data; atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. }; +// Per-thread state struct ggml_compute_state { ggml_thread_t thrd; + bool cpumask[GGML_N_CORES_MAX]; int ith; - struct ggml_compute_state_shared* shared; + struct ggml_compute_threadpool * threadpool; enum ggml_status ec; }; @@ -12472,7 +12545,7 @@ UseGgmlGemm1:; return; } // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. 
- atomic_store(&state->shared->current_chunk, nth); + atomic_store(&state->threadpool->current_chunk, nth); if (src1->type != vec_dot_type) { char * wdata = params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); @@ -12594,7 +12667,7 @@ UseGgmlGemm2:; break; } - current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1); + current_chunk = atomic_fetch_add(&state->threadpool->current_chunk, 1); } #ifdef GGML_PERF @@ -18893,65 +18966,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *)); } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// - -#ifdef __APPLE__ - -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#endif - // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -19026,9 +19040,10 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } static void clear_numa_thread_affinity(void) {} #endif -static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { - int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; - int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_threadpool * tp) { + int64_t cycles_cur = ggml_perf_cycles() - tp->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - tp->perf_node_start_time_us; node->perf_runs++; node->perf_cycles += cycles_cur; @@ -19282,16 +19297,355 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ return n_tasks; } +static thread_ret_t ggml_graph_compute_secondary_thread(void* data); + +enum { + SCHED_PRIO_NORMAL, + SCHED_PRIO_MEDIUM, + SCHED_PRIO_HIGH, + SCHED_PRIO_REALTIME +}; + +#if defined(_WIN32) +#include "windows.h" + +// TODO: support > 64 CPUs +static bool __thread_affinity(bool * mask) { + HANDLE h = GetCurrentThread(); + uint64_t bitmask = 0ULL; + + assert(GGML_N_CORES_MAX >= 64); 
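+    // Pack the first 64 entries of the boolean mask into a 64-bit affinity bitmask (8 entries per byte).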
+ + for (int32_t i = 0; i < 8; i++) { + int32_t idx = i * 8; + uint8_t val = 0; + val |= mask[idx + 0] << 0; + val |= mask[idx + 1] << 1; + val |= mask[idx + 2] << 2; + val |= mask[idx + 3] << 3; + val |= mask[idx + 4] << 4; + val |= mask[idx + 5] << 5; + val |= mask[idx + 6] << 6; + val |= mask[idx + 7] << 7; + bitmask |= (uint64_t)val << idx; + } + + for (int32_t i = 64; i < GGML_N_CORES_MAX; i++) { + if (mask[i]) { + fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n"); + break; + } + } + + DWORD_PTR m = (DWORD_PTR)mask; + + m = SetThreadAffinityMask(h, m); + + return m != 0; +} + +static bool __process_priority(int32_t prio) { + DWORD p = NORMAL_PRIORITY_CLASS; + + switch (prio) { + case SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; + case SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; + case SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; + case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; + } + + return SetPriorityClass(GetCurrentProcess(), p); +} + +static bool __thread_priority(int32_t prio) { + DWORD p = NORMAL_PRIORITY_CLASS; + + switch (prio) { + case SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; + case SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; + case SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; + case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; + } + + return SetThreadPriority(GetCurrentThread(), p); + +} + +#elif defined(__APPLE__) +#include +#include + +static bool __thread_affinity(const bool * mask) { + UNUSED(mask); + return true; +} + +static bool __process_priority(int32_t prio) { + int32_t p = 0; + + switch (prio) { + case SCHED_PRIO_NORMAL: p = 0; break; + case SCHED_PRIO_MEDIUM: p = -5; break; + case SCHED_PRIO_HIGH: p = -10; break; + case SCHED_PRIO_REALTIME: p = -20; break; + } + + int32_t r = setpriority(PRIO_PROCESS, 0, p); + return r != -1; +} + +static bool __thread_priority(int32_t prio) { + UNUSED(prio); + return true; +} + +#else // posix? 
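+// On the POSIX path below, affinity is applied through a cpu_set_t via
+// pthread_setaffinity_np() (plain sched_setaffinity() on Android), and the
+// priority levels are mapped onto the SCHED_OTHER / SCHED_FIFO scheduling
+// policies for both the process and the individual worker threads.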
+ +#include + +static bool __thread_affinity(const bool * mask) { + cpu_set_t cpuset; + int32_t err; + + CPU_ZERO(&cpuset); + + for (uint32_t i = 0; i < GGML_N_CORES_MAX; i++) { + if (mask[i]) { + CPU_SET(i, &cpuset); + } + } + +#ifdef __ANDROID__ + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { + err = errno; + } +#else + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); +#endif + if (err != 0) { + //fprintf(stderr, "warn: failed to set affinity mask 0x%llx (err %d: %s)\n", (unsigned long long)mask, err, strerror(err)); + return false; + } + + return true; +} + +static bool __process_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; + + switch (prio) { + case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + } + + int32_t err = sched_setscheduler(0, policy, &p); + if (err != 0) { + //fprintf(stderr, "warn: failed to set process priority %d (err %d)\n", prio, err); + return false; + } + + return true; +} + +static bool __thread_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; + switch (prio) { + case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + } + + int32_t err = pthread_setschedparam(pthread_self(), policy, &p); + if (err != 0) { + //fprintf(stderr, "warn: failed to set thread priority %d (err %d)\n", prio, err); + return false; + } + + return true; +} + +#endif + +static void __init_stack(size_t size) { + void* ptr = alloca(size); + if (ptr) { + memset(ptr, 0, size); + } +} + +#ifdef __aarch64__ + +static inline void __cpu_relax(void) { + __asm__ volatile("yield" ::: "memory"); +} + +#else + +static inline void __cpu_relax(void) { + __asm__ volatile("rep; nop" ::: "memory"); +} +#endif + +static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { + if (!global_mask) { + memset(local_mask, 1, GGML_N_CORES_MAX); + return; + } + if (!strict) { + memcpy(local_mask, global_mask, GGML_N_CORES_MAX); + return; + } else { + memset(local_mask, 0, GGML_N_CORES_MAX); + int32_t base_idx = *iter; + for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) { + int32_t idx = base_idx + i; + if (idx > GGML_N_CORES_MAX) { + // Just a cheaper modulo + idx -= GGML_N_CORES_MAX; + } + if (global_mask[idx]) { + local_mask[idx] = 1; + *iter = idx + 1; + return; + } + } + } +} + +struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) { + struct ggml_compute_threadpool * threadpool = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_threadpool)); + + { + threadpool->n_ready = 1; // the main thread is "ready" + threadpool->n_active = 0; + threadpool->node_n = 0; + threadpool->node_task = GGML_TASK_TYPE_FINALIZE; + threadpool->stop = false; + threadpool->pause = true; + threadpool->cgraph = NULL; + threadpool->cplan = NULL; + threadpool->workers = NULL; + threadpool->n_threads_max = tpp->n_threads; + threadpool->n_threads_cur = 0; + threadpool->poll = tpp->poll; + threadpool->prio = tpp->prio; + + threadpool->perf_node_start_cycles = 0ULL; 
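+        // (both per-node perf counters are re-armed by the compute loop before
+        //  each node is processed; zero is only the initial value)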
+ threadpool->perf_node_start_time_us = 0ULL; + + threadpool->abort_callback = NULL; + threadpool->abort_callback_data = NULL; + threadpool->current_chunk = 0; + } + + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); + + struct ggml_compute_state * workers = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_state) * tpp->n_threads); + + threadpool->workers = workers; + + __init_stack(2ULL * 1024 * 1024); + + int cpumask_iter = 0; + + __process_priority(tpp->prio); + __thread_priority(tpp->prio); + + for (int j = 0; j < tpp->n_threads; j++) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .threadpool = threadpool, + .ec = GGML_STATUS_SUCCESS, + }; + + if (tpp->mask_specified) { + __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); + } else { + workers[j].cpumask[j] = true; + } + + // Spin threads for all secondary workers + if (j > 0) { + int32_t rc = ggml_thread_create( + &workers[j].thrd, + NULL, + ggml_graph_compute_secondary_thread, + &workers[j] + ); + GGML_ASSERT(rc == 0); + } + } + + // Set the main-thread's affinity last + __thread_affinity(workers[0].cpumask); + // Ensure all threads entered the compute loop before returning. + while (atomic_load(&threadpool->n_ready) != threadpool->n_threads_max) { ; } + + return threadpool; +} + +void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { + if (!threadpool) return; + + struct ggml_compute_state* workers = threadpool->workers; + const int32_t n_threads = threadpool->n_threads_max; + + // Don't really need to lock in the polling mode but it doesn't hurt + ggml_mutex_lock(&threadpool->mutex); + threadpool->n_threads_cur = n_threads; + threadpool->stop = true; + threadpool->pause = false; + + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); + + for (int32_t j = 1; j < n_threads; j++) { + int32_t rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED); + UNUSED(rc); + } + + GGML_ALIGNED_FREE(workers); + + ggml_mutex_destroy(&threadpool->mutex); + ggml_cond_destroy(&threadpool->cond); + + GGML_ALIGNED_FREE(threadpool); +} + +void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { + GGML_PRINT_DEBUG("Pausing threadpool\n"); + threadpool->pause = true; +} + +void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { + ggml_mutex_lock(&threadpool->mutex); + GGML_PRINT_DEBUG("Resuming threadpool\n"); + threadpool->pause = false; + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); +} + static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) { // wait for other threads to finish const int last_node_n = * node_n; while (true) { if (do_yield) { - sched_yield(); + __cpu_relax(); } - * node_n = atomic_load(&state->shared->node_n); + * node_n = atomic_load(&state->threadpool->node_n); if (* node_n != last_node_n) break; #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. @@ -19306,10 +19660,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co while (true) { if (do_yield) { - sched_yield(); + __cpu_relax(); } - * task_phase = atomic_load(&state->shared->node_task); + * task_phase = atomic_load(&state->threadpool->node_task); if (* task_phase != last_task_phase) break; #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. 
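// A minimal sketch of how a caller might drive the explicit threadpool API that
// this patch introduces, given an already-built struct ggml_cgraph * graph (all
// names come from the ggml.h/ggml.c changes in this series; the thread count,
// prio value and work-buffer handling are illustrative placeholders, and error
// handling is omitted):
//
//     struct ggml_threadpool_params tpp = {
//         .mask_specified = false,           // no explicit CPU pinning
//         .n_threads      = 8,
//         .prio           = 0,               // SCHED_PRIO_NORMAL (enum lives in ggml.c)
//         .poll           = false,           // block on the condvar instead of spinning
//         .strict_cpu     = false,
//     };
//     struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp);
//
//     struct ggml_cplan cplan = ggml_graph_plan(graph, tpp.n_threads, tp);
//     cplan.work_data = malloc(cplan.work_size);   // caller-owned work buffer
//
//     ggml_resume_threadpool(tp);                  // pools are created paused
//     ggml_graph_compute(graph, &cplan);
//     ggml_pause_threadpool(tp);                   // optionally park the workers
//
//     free(cplan.work_data);
//     ggml_release_threadpool(tp);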
@@ -19318,13 +19672,13 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co } } -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; +static thread_ret_t ggml_graph_compute_thread(struct ggml_compute_state * state) { + struct ggml_compute_threadpool * threadpool = state->threadpool; - const struct ggml_cgraph * cgraph = state->shared->cgraph; - const struct ggml_cplan * cplan = state->shared->cplan; + const struct ggml_cgraph * cgraph = threadpool->cgraph; + const struct ggml_cplan * cplan = threadpool->cplan; - const int n_threads = state->shared->n_threads; + const int n_threads = threadpool->n_threads_cur; set_numa_thread_affinity(state->ith); @@ -19333,12 +19687,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { while (true) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->node_n += 1; + threadpool->node_n += 1; state->ec = GGML_STATUS_ABORTED; return 0; } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + if (atomic_fetch_sub(&threadpool->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again struct ggml_compute_params params = { @@ -19353,20 +19707,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* FINALIZE */ struct ggml_tensor * node = cgraph->nodes[node_n]; if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + params.nth = ggml_get_n_tasks(node, n_threads, threadpool->n_threads_cur); ggml_compute_forward(¶ms, node, state); } - ggml_graph_compute_perf_stats_node(node, state->shared); + ggml_graph_compute_perf_stats_node(node, threadpool); } // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + const int n_tasks = ggml_get_n_tasks(node, n_threads, threadpool->n_threads_cur); - state->shared->perf_node_start_cycles = ggml_perf_cycles(); - state->shared->perf_node_start_time_us = ggml_perf_time_us(); + threadpool->perf_node_start_cycles = ggml_perf_cycles(); + threadpool->perf_node_start_time_us = ggml_perf_time_us(); params.nth = n_tasks; @@ -19387,7 +19741,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node, state); } - ggml_graph_compute_perf_stats_node(node, state->shared); + ggml_graph_compute_perf_stats_node(node, threadpool); } else { break; } @@ -19398,9 +19752,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } task_phase = GGML_TASK_TYPE_INIT; - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_n, node_n); - atomic_store(&state->shared->node_task, task_phase); + atomic_store(&threadpool->n_active, n_threads); + atomic_store(&threadpool->node_n, node_n); + atomic_store(&threadpool->node_task, task_phase); } else { ggml_graph_compute_thread_sync_node(&node_n, state, false); ggml_graph_compute_thread_sync_task(&task_phase, state, false); @@ -19411,7 +19765,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* INIT & COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; - const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + const int n_tasks = 
ggml_get_n_tasks(node, n_threads, threadpool->n_threads_cur); struct ggml_compute_params params = { /*.type =*/ GGML_TASK_TYPE_INIT, @@ -19427,12 +19781,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + if (atomic_fetch_sub(&threadpool->n_active, 1) == 1) { task_phase = GGML_TASK_TYPE_COMPUTE; - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_task, task_phase); - } - else { + atomic_store(&threadpool->n_active, n_threads); + atomic_store(&threadpool->node_task, task_phase); + } else { // TODO: this sched_yield can have significant impact on the performance - either positive or negative // depending on the workload and the operating system. // since it is not clear what is the best approach, it should potentially become user-configurable @@ -19447,10 +19800,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node, state); } - if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + if (atomic_fetch_sub(&threadpool->n_active, 1) == 1) { task_phase = GGML_TASK_TYPE_FINALIZE; - atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_task, task_phase); + atomic_store(&threadpool->n_active, n_threads); + atomic_store(&threadpool->node_task, task_phase); } else { ggml_graph_compute_thread_sync_task(&task_phase, state, false); @@ -19460,9 +19813,91 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { - if (n_threads <= 0) { - n_threads = GGML_DEFAULT_N_THREADS; +static inline int32_t ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { + int32_t node_n; + struct ggml_compute_threadpool * threadpool = state->threadpool; + + do { + if (threadpool->poll) { + node_n = atomic_load(&threadpool->node_n); + if (node_n != -1) { + // No new work. Yield, and keep polling. + sched_yield(); + node_n = atomic_load(&threadpool->node_n); + } + } else { + ggml_mutex_lock_shared(&threadpool->mutex); + node_n = atomic_load(&threadpool->node_n); + if (node_n != -1 && !threadpool->stop && !threadpool->pause) { + // No new work. Wait for the signal. + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + node_n = atomic_load(&threadpool->node_n); + } + ggml_mutex_unlock_shared(&threadpool->mutex); + } + } while (state->ith >= threadpool->n_threads_cur); + return node_n; +} + +static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_compute_threadpool * threadpool = state->threadpool; + +#ifndef __aarch64__ + __init_stack(2ULL * 1024 * 1024); +#endif + + __thread_priority(threadpool->prio); + __thread_affinity(state->cpumask); + + // Indicate that we're ready to go + atomic_fetch_add(&threadpool->n_ready, 1); + + while (true) { + // Check if we need to sleep + while (threadpool->pause) { + GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith); + ggml_mutex_lock_shared(&threadpool->mutex); + if (threadpool->pause) { + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith); + ggml_mutex_unlock_shared(&threadpool->mutex); + } + // This needs to be checked for after the cond_wait + if (threadpool->stop) break; + + // Check if there is new work + // node_n == -1 means we have a fresh graph to compute on. 
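+        // Depending on threadpool->poll, workers discover new work either by
+        // polling node_n (yielding between reads) or by blocking on the
+        // threadpool's condition variable until the main thread signals it;
+        // see ggml_graph_compute_check_for_work().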
+ // Only the main thread sets node_n back to -1. + + int32_t node_n = ggml_graph_compute_check_for_work(state); + if (node_n == -1) { + atomic_fetch_sub(&threadpool->n_ready, 1); + + int64_t ret = (int64_t) ggml_graph_compute_thread(state); + if (ret == GGML_EXIT_ABORTED) + return (thread_ret_t) ret; + + if (ret != GGML_EXIT_SUCCESS && ret != GGML_EXIT_ABORTED) { + fprintf(stderr, "ggml_graph_compute_thread exited with an unexpected error: %lld\n", (long long int) ret); + GGML_ASSERT(false); + } + + atomic_fetch_add(&threadpool->n_ready, 1); + } + } + + return (thread_ret_t) 0; +} + +struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int32_t n_threads, + struct ggml_compute_threadpool * threadpool +) { + if (threadpool == NULL) { + //GGML_PRINT("WARNING: Threadpool is not specified. Will create a disposable threadpool\n"); } size_t work_size = 0; @@ -19634,12 +20069,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } if (work_size > 0) { - work_size += CACHE_LINE_SIZE*(n_threads - 1); + work_size += CACHE_LINE_SIZE*(n_threads); } - cplan.n_threads = MIN(max_tasks, n_threads); - cplan.work_size = work_size; - cplan.work_data = NULL; + cplan.threadpool = threadpool; + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; return cplan; } @@ -19654,63 +20090,78 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } } - const int n_threads = cplan->n_threads; + const int64_t perf_start_cycles = ggml_perf_cycles(); + const int64_t perf_start_time_us = ggml_perf_time_us(); - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ cplan, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - /*.node_task =*/ GGML_TASK_TYPE_FINALIZE, - /*.abort_callback =*/ NULL, - /*.abort_callback_data =*/ NULL, - /*.current_chunk; =*/ 0, - }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + const int n_threads = cplan->n_threads; + struct ggml_compute_threadpool * threadpool = cplan->threadpool; + bool disposable_threadpool = false; + + if (threadpool == NULL) { + //GGML_PRINT("NOTE: Threadpool is not specified. Will create a disposable threadpool\n"); + struct ggml_threadpool_params tpp = { + .mask_specified = false, + .n_threads = n_threads, + .strict_cpu = false, + .prio = 1, + .poll = false + }; - // create thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - .ec = GGML_STATUS_SUCCESS, - }; + threadpool = ggml_create_threadpool(&tpp); + ggml_resume_threadpool(threadpool); + disposable_threadpool = true; + } else if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: Requesting more threads that the threadpool contains. 
Expect a bad time.\n"); + } - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } + // Initialize worker ordering + for (int j = 0; j < n_threads; ++j) { + threadpool->workers[j].ith = j; } - workers[0].ith = 0; - workers[0].shared = &state_shared; - workers[0].ec = GGML_STATUS_SUCCESS; + // Set up work + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_active = n_threads; + threadpool->n_threads_cur = n_threads; - const int64_t perf_start_cycles = ggml_perf_cycles(); - const int64_t perf_start_time_us = ggml_perf_time_us(); + atomic_store(&threadpool->node_n, -1); + + // Kick threadpool + if (!threadpool->poll && n_threads > 1) { + ggml_mutex_lock(&threadpool->mutex); + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); + } - // this is a work thread too - ggml_graph_compute_thread(&workers[0]); - enum ggml_status compute_status = workers[0].ec; + int compute_status = GGML_STATUS_SUCCESS; + + // The main-thread is a work thread too. Start computing... + atomic_fetch_sub(&threadpool->n_ready, 1); + compute_status = (size_t) ggml_graph_compute_thread(&threadpool->workers[0]); // don't leave affinity set on the main thread clear_numa_thread_affinity(); - // join or kill thread pool + // Wait for all other threads to finish if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - if (workers[j].ec != GGML_STATUS_SUCCESS) - compute_status = workers[j].ec; + atomic_fetch_add(&threadpool->n_ready, 1); + // wait for thread pool + while (atomic_load(&threadpool->n_ready) < threadpool->n_threads_max) { + __cpu_relax(); + } + } + + for (int j = 0; j < n_threads; ++j) { + if (threadpool->workers[j].ec != GGML_STATUS_SUCCESS) { + compute_status = threadpool->workers[j].ec; } } + if (disposable_threadpool) { + ggml_release_threadpool(threadpool); + } + // performance stats (graph) { int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; @@ -19731,8 +20182,12 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl return compute_status; } -enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); +enum ggml_status ggml_graph_compute_with_ctx( + struct ggml_context * ctx, + struct ggml_cgraph * cgraph, + int32_t n_threads +) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); @@ -20543,7 +20998,7 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; @@ -20890,7 +21345,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; diff --git a/ggml.h b/ggml.h index be81e0c52316b..314ea8b55cb85 100644 --- a/ggml.h +++ b/ggml.h @@ -274,6 +274,8 @@ #define GGML_UNREACHABLE() ((void) 0) #endif +#define GGML_N_CORES_MAX 512 + // used to copy the number of elements and stride in bytes of tensors into local variables. // main purpose is to reduce code duplication and improve readability. // @@ -609,6 +611,8 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); + struct ggml_compute_threadpool; + // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -616,6 +620,7 @@ extern "C" { uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` int n_threads; + struct ggml_compute_threadpool * threadpool; // abort ggml_graph_compute when true ggml_abort_callback abort_callback; @@ -653,6 +658,15 @@ extern "C" { int64_t perf_time_us; }; + struct ggml_threadpool_params { + bool cpumask[GGML_N_CORES_MAX]; + bool mask_specified; + int32_t n_threads; + int32_t prio; + bool poll; + bool strict_cpu; + }; + // scratch buffer struct ggml_scratch { size_t offs; @@ -2025,9 +2039,18 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); + GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); + GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool); + GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool); + GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool); + // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int n_threads, + struct ggml_compute_threadpool * threadpool); GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data diff --git a/llama.cpp b/llama.cpp index 15c66077525a7..69b441020cbd3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2198,6 +2198,9 @@ struct llama_context { #endif ggml_backend_t backend_cpu = nullptr; + ggml_compute_threadpool_t threadpool = nullptr; + 
ggml_compute_threadpool_t threadpool_batch = nullptr; + const llama_model & model; // key + value cache for the self attention @@ -11346,9 +11349,15 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { static void llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads) { + llama_context & lctx, + ggml_cgraph * gf, + int n_threads, + ggml_compute_threadpool * threadpool) { +#ifdef GGML_USE_MPI + const int64_t n_layer = lctx.model.hparams.n_layer; + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); +#endif + #ifdef GGML_USE_METAL if (ggml_backend_is_metal(lctx.backend_metal)) { ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); @@ -11360,7 +11369,7 @@ static void llama_graph_compute( ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } - ggml_backend_sched_graph_compute_async(lctx.sched, gf); + ggml_backend_sched_graph_compute_async(lctx.sched, gf, threadpool); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } @@ -11483,7 +11492,35 @@ static int llama_decode_internal( lctx.n_outputs = n_outputs_new; } - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + int n_threads; + ggml_compute_threadpool* threadpool = nullptr; // nullptr -> disposable threadpool + if (n_tokens == 1) { + if (lctx.threadpool_batch) { + ggml_pause_threadpool(lctx.threadpool_batch); + } + if (lctx.threadpool) { + ggml_resume_threadpool(lctx.threadpool); + threadpool = lctx.threadpool; + } + n_threads = cparams.n_threads; + + } else { + if (lctx.threadpool && !lctx.threadpool_batch) { + ggml_pause_threadpool(lctx.threadpool); + } + if (lctx.threadpool_batch) { + ggml_resume_threadpool(lctx.threadpool_batch); + threadpool = lctx.threadpool_batch; + n_threads = cparams.n_threads_batch; + } else if (lctx.threadpool) { + ggml_resume_threadpool(lctx.threadpool); + threadpool = lctx.threadpool; + n_threads = cparams.n_threads; + } else { + n_threads = cparams.n_threads_batch; + } + } + GGML_ASSERT(n_threads > 0); // helpers for smoother batch API transition @@ -11596,7 +11633,7 @@ static int llama_decode_internal( llama_set_inputs(lctx, u_batch); - llama_graph_compute(lctx, gf, n_threads); + llama_graph_compute(lctx, gf, n_threads, threadpool); // update the kv ring buffer { @@ -11920,7 +11957,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); #endif //const int64_t t_end = ggml_time_us(); @@ -11942,7 +11979,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { llama_set_k_shift(lctx); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; } @@ -11968,7 +12005,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { llama_set_s_copy(lctx); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; } @@ -15391,7 +15428,7 @@ static int llama_apply_lora_from_file_internal( return 1; } - ggml_backend_graph_compute(backend_cpu, gf); + ggml_backend_graph_compute(backend_cpu, gf, nullptr); ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); @@ -15558,6 +15595,31 @@ void llama_numa_init(enum 
ggml_numa_strategy numa) { } } +void llama_attach_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool) { + ctx->threadpool = threadpool; +} + +void llama_attach_batch_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool_batch) { + ctx->threadpool_batch = threadpool_batch; +} + +void llama_detach_threadpool(struct llama_context * ctx) { + ctx->threadpool = nullptr; +} + +void llama_detach_batch_threadpool(struct llama_context * ctx) { + ctx->threadpool = nullptr; +} + +void llama_detach_threadpools(struct llama_context * ctx) { + llama_detach_threadpool(ctx); + llama_detach_batch_threadpool(ctx); +} + void llama_backend_free(void) { ggml_quantize_free(); } diff --git a/llama.h b/llama.h index 16cece5db0e78..7aae52e29633f 100644 --- a/llama.h +++ b/llama.h @@ -388,6 +388,16 @@ extern "C" { //optional: LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa); + LLAMA_API void llama_attach_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool); + LLAMA_API void llama_attach_batch_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool); + LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); + LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx); + LLAMA_API void llama_detach_threadpools(struct llama_context * ctx); + // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index de74585da29dd..f9d033b52bbfc 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -587,7 +587,7 @@ struct test_case { ggml_build_forward_expand(gf, out); // warmup run - ggml_backend_graph_compute(backend, gf); + ggml_backend_graph_compute(backend, gf, nullptr); // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 
1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU @@ -619,7 +619,7 @@ struct test_case { ggml_backend_synchronize(backend); int64_t start_time = ggml_time_us(); - ggml_backend_graph_compute(backend, gf); + ggml_backend_graph_compute(backend, gf, nullptr); ggml_backend_synchronize(backend); int64_t end_time = ggml_time_us(); double time_us = end_time - start_time; diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 26c1f42dc0e95..7e55cca579866 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32( } static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); From 467583831c671df6ce97aaa7cdd43ba3c49953b4 Mon Sep 17 00:00:00 2001 From: fmz Date: Fri, 24 May 2024 19:28:03 -0700 Subject: [PATCH 02/15] set bitmask properly on windows --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 3b4c093b30083..26d0dee825f2f 100644 --- a/ggml.c +++ b/ggml.c @@ -19337,7 +19337,7 @@ static bool __thread_affinity(bool * mask) { } } - DWORD_PTR m = (DWORD_PTR)mask; + DWORD_PTR m = (DWORD_PTR)bitmask; m = SetThreadAffinityMask(h, m); From 0ce35a67127de85bea2af51c12b30725c3de3645 Mon Sep 17 00:00:00 2001 From: fmz Date: Fri, 24 May 2024 12:53:46 -0700 Subject: [PATCH 03/15] reset cpu affinity every time for main thread --- ggml.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 26d0dee825f2f..f90fe22ab9adc 100644 --- a/ggml.c +++ b/ggml.c @@ -19586,8 +19586,6 @@ struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_p } } - // Set the main-thread's affinity last - __thread_affinity(workers[0].cpumask); // Ensure all threads entered the compute loop before returning. while (atomic_load(&threadpool->n_ready) != threadpool->n_threads_max) { ; } @@ -20119,6 +20117,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl threadpool->workers[j].ith = j; } + // Update main thread affinity to match the current threadpool + __thread_affinity(threadpool->workers[0].cpumask); + // Set up work threadpool->cgraph = cgraph; threadpool->cplan = cplan; From 8bdae2192b8966f5ba59e09c965d717b088d9a17 Mon Sep 17 00:00:00 2001 From: fmz Date: Fri, 24 May 2024 20:20:53 -0700 Subject: [PATCH 04/15] roll back CMakePresets.json changes --- CMakePresets.json | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 44e11a5f9cf78..ad1af7eccebbd 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -11,17 +11,6 @@ "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." } }, - { - "name": "msvc", - "hidden": true, - "generator": "Visual Studio 17 2022", - "architecture": "ARM", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
- } - }, { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, @@ -49,8 +38,8 @@ { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] }, { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] }, - { "name": "arm64-windows-msvc-debug" , "inherits": [ "msvc", "arm64-windows-msvc", "debug" ] }, - { "name": "arm64-windows-msvc-release", "inherits": [ "msvc", "arm64-windows-msvc", "release" ] }, - { "name": "arm64-windows-msvc+static-release", "inherits": [ "msvc", "arm64-windows-msvc", "release", "static" ] } + { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, + { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] }, + { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] } ] } From ef1b87d1af564945879741d8abfd5f824f3843a8 Mon Sep 17 00:00:00 2001 From: fmz Date: Fri, 24 May 2024 22:20:52 -0700 Subject: [PATCH 05/15] free batch threadpool in main --- examples/main/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 747c399209955..42a94a8c3156f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -983,6 +983,7 @@ int main(int argc, char ** argv) { llama_backend_free(); ggml_release_threadpool(threadpool); + ggml_release_threadpool(threadpool_batch); #ifndef LOG_DISABLE_LOGS LOG_TEE("Log end\n"); From e77167446ae09ebe665091c55c7046feb9b72085 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 25 May 2024 05:14:29 -0700 Subject: [PATCH 06/15] threadpool: proper handling for non-specified cpumask --- ggml.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/ggml.c b/ggml.c index f90fe22ab9adc..e67bd148a9cc4 100644 --- a/ggml.c +++ b/ggml.c @@ -1834,6 +1834,7 @@ struct ggml_compute_threadpool { struct ggml_compute_state { ggml_thread_t thrd; bool cpumask[GGML_N_CORES_MAX]; + bool mask_specified; int ith; struct ggml_compute_threadpool * threadpool; enum ggml_status ec; @@ -19472,13 +19473,6 @@ static bool __thread_priority(int32_t prio) { #endif -static void __init_stack(size_t size) { - void* ptr = alloca(size); - if (ptr) { - memset(ptr, 0, size); - } -} - #ifdef __aarch64__ static inline void __cpu_relax(void) { @@ -19553,8 +19547,6 @@ struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_p threadpool->workers = workers; - __init_stack(2ULL * 1024 * 1024); - int cpumask_iter = 0; __process_priority(tpp->prio); @@ -19566,12 +19558,12 @@ struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_p .ith = j, .threadpool = threadpool, .ec = GGML_STATUS_SUCCESS, + .mask_specified = false }; if (tpp->mask_specified) { __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); - } else { - workers[j].cpumask[j] = true; + workers[j].mask_specified = true; } // Spin threads for all secondary workers @@ -19841,12 +19833,9 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_threadpool * threadpool = state->threadpool; -#ifndef __aarch64__ - __init_stack(2ULL * 1024 * 1024); -#endif - __thread_priority(threadpool->prio); - 
__thread_affinity(state->cpumask); + if (state->mask_specified) + __thread_affinity(state->cpumask); // Indicate that we're ready to go atomic_fetch_add(&threadpool->n_ready, 1); @@ -20096,7 +20085,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl bool disposable_threadpool = false; if (threadpool == NULL) { - //GGML_PRINT("NOTE: Threadpool is not specified. Will create a disposable threadpool\n"); + // GGML_PRINT("NOTE: Threadpool is not specified. Will create a disposable threadpool\n"); struct ggml_threadpool_params tpp = { .mask_specified = false, .n_threads = n_threads, @@ -20118,7 +20107,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } // Update main thread affinity to match the current threadpool - __thread_affinity(threadpool->workers[0].cpumask); + if (threadpool->workers[0].mask_specified) + __thread_affinity(threadpool->workers[0].cpumask); // Set up work threadpool->cgraph = cgraph; From c1e7f488c3a9d342218a78f1332d2b76d4b1405b Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 25 May 2024 05:15:11 -0700 Subject: [PATCH 07/15] threadpool: add persistent threadpool for llama-bench --- examples/llama-bench/llama-bench.cpp | 37 ++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2afdb3abdc278..81b84eb44f8e2 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1210,8 +1210,7 @@ struct sql_printer : public printer { } }; -static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { - llama_set_n_threads(ctx, n_threads, n_threads); +static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch) { const llama_model * model = llama_get_model(ctx); const int32_t n_vocab = llama_n_vocab(model); @@ -1233,9 +1232,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat llama_synchronize(ctx); } -static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { - llama_set_n_threads(ctx, n_threads, n_threads); - +static void test_gen(llama_context * ctx, int n_gen, int n_past) { const llama_model * model = llama_get_model(ctx); const int32_t n_vocab = llama_n_vocab(model); @@ -1332,13 +1329,31 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); + struct ggml_threadpool_params tpp; + tpp.n_threads = t.n_threads; + + // TODO: expose these via cli opts + tpp.mask_specified = false; + tpp.strict_cpu = false; + tpp.prio = 1; + tpp.poll = false; + + struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + + llama_set_n_threads(ctx, t.n_threads, t.n_threads); + llama_attach_threadpool(ctx, threadpool); + // warmup run if (t.n_prompt > 0) { - //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); - test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); + //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch); + test_prompt(ctx, t.n_prompt, 0, t.n_batch); } if (t.n_gen > 0) { - test_gen(ctx, 1, 0, t.n_threads); + test_gen(ctx, 1, 0); } for (int i = 0; i < params.reps; i++) { @@ -1347,10 +1362,10 @@ int main(int argc, char ** argv) { uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { - test_prompt(ctx, t.n_prompt, 0, t.n_batch, 
t.n_threads); + test_prompt(ctx, t.n_prompt, 0, t.n_batch); } if (t.n_gen > 0) { - test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); + test_gen(ctx, t.n_gen, t.n_prompt); } uint64_t t_ns = get_time_ns() - t_start; @@ -1362,6 +1377,8 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + + ggml_release_threadpool(threadpool); } llama_free_model(lmodel); From a67dbcc538516a9bc63429b0b359685758f7bcec Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 27 May 2024 21:29:58 -0700 Subject: [PATCH 08/15] threadpool: fix compiler errors for android and x64 builds --- ggml.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index e67bd148a9cc4..463b40f328c07 100644 --- a/ggml.c +++ b/ggml.c @@ -19402,6 +19402,9 @@ static bool __thread_priority(int32_t prio) { #else // posix? +#ifndef __USE_GNU +#define __USE_GNU +#endif #include static bool __thread_affinity(const bool * mask) { @@ -19473,17 +19476,16 @@ static bool __thread_priority(int32_t prio) { #endif -#ifdef __aarch64__ - +#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) static inline void __cpu_relax(void) { __asm__ volatile("yield" ::: "memory"); } - -#else - +#elif defined(__x86_64__) static inline void __cpu_relax(void) { - __asm__ volatile("rep; nop" ::: "memory"); + _mm_pause(); } +#else +static inline void __cpu_relax(void) {;} #endif static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { From 1d9d39a18e2ce505cbb7b304bd85954dc224bf03 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 27 May 2024 22:13:07 -0700 Subject: [PATCH 09/15] threadpool: update backend interface in ggml-rpc --- ggml-rpc.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index cc1d3ace1ddac..f3a4fe8271283 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -585,7 +585,8 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector & o memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); } -GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, ggml_compute_threadpool * tp) { + UNUSED(tp); ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; std::vector input; serialize_graph(cgraph, input); @@ -1020,7 +1021,7 @@ bool rpc_server::graph_compute(const std::vector & input, std::vectornodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map); } - ggml_status status = ggml_backend_graph_compute(backend, graph); + ggml_status status = ggml_backend_graph_compute(backend, graph, NULL); // output serialization format: | status (1 byte) | output.resize(1, 0); output[0] = status; From 5c9222d88f6f5825c98e0256ac43d0100dfb03ee Mon Sep 17 00:00:00 2001 From: fmz Date: Tue, 28 May 2024 08:47:35 -0700 Subject: [PATCH 10/15] Restrict threadpool to CPU backend --- examples/llava/clip.cpp | 2 +- ggml-backend-impl.h | 4 +-- ggml-backend.c | 72 +++++++++++++++++++------------------- ggml-backend.h | 14 ++++---- ggml.c | 2 +- ggml.h | 2 +- llama.cpp | 5 +-- tests/test-backend-ops.cpp | 4 +-- 8 files changed, 52 insertions(+), 53 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 87f2d8b19793c..95fbe3d0216c4 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1915,7 +1915,7 @@ bool 
clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } #endif - ggml_backend_graph_compute(ctx->backend, gf, NULL); + ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index b5d4d0f8d16ed..950711dd5ebd5 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -92,14 +92,14 @@ extern "C" { void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan (not used currently) - ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool); + ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph with a plan enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) - enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool); + enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); diff --git a/ggml-backend.c b/ggml-backend.c index ae185e7f96307..9565e55d37ea9 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -255,13 +255,12 @@ void ggml_backend_synchronize(ggml_backend_t backend) { } ggml_backend_graph_plan_t ggml_backend_graph_plan_create( - ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + const struct ggml_cgraph * cgraph ) { GGML_ASSERT(backend->iface.graph_plan_create != NULL); - return backend->iface.graph_plan_create(backend, cgraph, threadpool); + return backend->iface.graph_plan_create(backend, cgraph); } void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { @@ -281,20 +280,18 @@ enum ggml_status ggml_backend_graph_plan_compute( enum ggml_status ggml_backend_graph_compute( ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + struct ggml_cgraph * cgraph ) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, threadpool); + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); ggml_backend_synchronize(backend); return err; } enum ggml_status ggml_backend_graph_compute_async( - ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + struct ggml_cgraph * cgraph ) { - return backend->iface.graph_compute(backend, cgraph, threadpool); + return backend->iface.graph_compute(backend, cgraph); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -741,7 +738,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { #endif struct ggml_backend_cpu_context { - int n_threads; + int n_threads; + ggml_compute_threadpool_t threadpool; + void * work_data; size_t work_size; @@ -774,15 +773,14 @@ struct ggml_backend_plan_cpu { }; GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create( - ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t 
threadpool + ggml_backend_t backend, + const struct ggml_cgraph * cgraph ) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool); + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { @@ -817,13 +815,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe } GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute( - ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + struct ggml_cgraph * cgraph ) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); @@ -892,6 +889,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; ctx->work_data = NULL; ctx->work_size = 0; ctx->abort_callback = NULL; @@ -922,6 +920,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->threadpool = threadpool; +} + void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); @@ -1653,10 +1658,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } -static enum ggml_status ggml_backend_sched_compute_splits( - ggml_backend_sched_t sched, - ggml_compute_threadpool_t threadpool -) { +static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_backend_sched_split * splits = sched->splits; for (int i = 0; i < sched->n_splits; i++) { @@ -1690,7 +1692,7 @@ static enum ggml_status ggml_backend_sched_compute_splits( } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, threadpool); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1712,7 +1714,7 @@ static enum ggml_status ggml_backend_sched_compute_splits( struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, threadpool); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1852,19 +1854,17 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra } enum ggml_status ggml_backend_sched_graph_compute( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph, - ggml_compute_threadpool_t threadpool + ggml_backend_sched_t sched, + struct ggml_cgraph * graph ) { - enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph, threadpool); + 
enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph); ggml_backend_sched_synchronize(sched); return err; } enum ggml_status ggml_backend_sched_graph_compute_async( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph, - ggml_compute_threadpool_t threadpool + ggml_backend_sched_t sched, + struct ggml_cgraph * graph ) { if (!sched->is_reset && !sched->is_alloc) { ggml_backend_sched_reset(sched); @@ -1876,7 +1876,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async( } } - return ggml_backend_sched_compute_splits(sched, threadpool); + return ggml_backend_sched_compute_splits(sched); } void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { @@ -2115,8 +2115,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); - ggml_backend_graph_compute(backend1, &g1v, NULL); - ggml_backend_graph_compute(backend2, &g2v, NULL); + ggml_backend_graph_compute(backend1, &g1v); + ggml_backend_graph_compute(backend2, &g2v); if (ggml_is_view_op(t1->op)) { continue; diff --git a/ggml-backend.h b/ggml-backend.h index b17d9770b55c9..ba3a898cda29e 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -69,8 +69,7 @@ extern "C" { GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create( ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + const struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); @@ -79,12 +78,10 @@ extern "C" { ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute( ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async( ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); @@ -112,6 +109,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool); GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer @@ -205,8 +203,8 @@ extern "C" { // Allocate and compute graph on the backend scheduler GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); - GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool); - GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool); + GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t 
sched); // Reset all assignments and allocators - must be called before changing the node backends diff --git a/ggml.c b/ggml.c index 463b40f328c07..5a86c17176787 100644 --- a/ggml.c +++ b/ggml.c @@ -19501,7 +19501,7 @@ static void __cpumask_next(const bool * global_mask, bool * local_mask, bool str int32_t base_idx = *iter; for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) { int32_t idx = base_idx + i; - if (idx > GGML_N_CORES_MAX) { + if (idx >= GGML_N_CORES_MAX) { // Just a cheaper modulo idx -= GGML_N_CORES_MAX; } diff --git a/ggml.h b/ggml.h index 314ea8b55cb85..7020cf28f1d3d 100644 --- a/ggml.h +++ b/ggml.h @@ -2051,7 +2051,7 @@ extern "C" { const struct ggml_cgraph * cgraph, int n_threads, struct ggml_compute_threadpool * threadpool); - GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); diff --git a/llama.cpp b/llama.cpp index 69b441020cbd3..1fc26a399a3c9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11366,10 +11366,11 @@ static void llama_graph_compute( if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } - ggml_backend_sched_graph_compute_async(lctx.sched, gf, threadpool); + ggml_backend_sched_graph_compute_async(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } @@ -15428,7 +15429,7 @@ static int llama_apply_lora_from_file_internal( return 1; } - ggml_backend_graph_compute(backend_cpu, gf, nullptr); + ggml_backend_graph_compute(backend_cpu, gf); ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f9d033b52bbfc..de74585da29dd 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -587,7 +587,7 @@ struct test_case { ggml_build_forward_expand(gf, out); // warmup run - ggml_backend_graph_compute(backend, gf, nullptr); + ggml_backend_graph_compute(backend, gf); // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 
From cbab212a322b6a3f5f821624d825b360f941b1d7 Mon Sep 17 00:00:00 2001 From: fmz Date: Tue, 28 May 2024 08:47:35 -0700 Subject: [PATCH 11/15] Restrict threadpool to CPU backend --- examples/llava/clip.cpp | 2 +- ggml-backend-impl.h | 4 +-- ggml-backend.c | 72 +++++++++++++++++++------------------- ggml-backend.h | 14 ++++---- ggml-cuda.cu | 8 ++--- ggml-kompute.cpp | 7 +--- ggml-metal.m | 7 +--- ggml-opencl.cpp | 7 +--- ggml-rpc.cpp | 5 ++- ggml-sycl.cpp | 8 +---- ggml-vulkan.cpp | 7 +--- ggml.c | 2 +- ggml.h | 2 +- llama.cpp | 5 +-- tests/test-backend-ops.cpp | 4 +-- 15 files changed, 61 insertions(+), 93 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 87f2d8b19793c..95fbe3d0216c4 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1915,7 +1915,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } #endif - ggml_backend_graph_compute(ctx->backend, gf, NULL); + ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index b5d4d0f8d16ed..950711dd5ebd5 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -92,14 +92,14 @@ extern "C" { void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan (not used currently) - ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool); + ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph with a plan enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) - enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool); + enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); diff --git a/ggml-backend.c b/ggml-backend.c index ae185e7f96307..9565e55d37ea9 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -255,13 +255,12 @@ void ggml_backend_synchronize(ggml_backend_t backend) { } ggml_backend_graph_plan_t ggml_backend_graph_plan_create( - ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + const struct ggml_cgraph * cgraph ) { GGML_ASSERT(backend->iface.graph_plan_create != NULL); - return backend->iface.graph_plan_create(backend, cgraph, threadpool); + return backend->iface.graph_plan_create(backend, cgraph); } void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { @@ -281,20 +280,18 @@ enum ggml_status ggml_backend_graph_plan_compute( enum ggml_status ggml_backend_graph_compute( ggml_backend_t backend, - struct ggml_cgraph
* cgraph, - ggml_compute_threadpool_t threadpool + struct ggml_cgraph * cgraph ) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, threadpool); + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); ggml_backend_synchronize(backend); return err; } enum ggml_status ggml_backend_graph_compute_async( - ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + struct ggml_cgraph * cgraph ) { - return backend->iface.graph_compute(backend, cgraph, threadpool); + return backend->iface.graph_compute(backend, cgraph); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -741,7 +738,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { #endif struct ggml_backend_cpu_context { - int n_threads; + int n_threads; + ggml_compute_threadpool_t threadpool; + void * work_data; size_t work_size; @@ -774,15 +773,14 @@ struct ggml_backend_plan_cpu { }; GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create( - ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + const struct ggml_cgraph * cgraph ) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool); + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { @@ -817,13 +815,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe } GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute( - ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool + ggml_backend_t backend, + struct ggml_cgraph * cgraph ) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); @@ -892,6 +889,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; ctx->work_data = NULL; ctx->work_size = 0; ctx->abort_callback = NULL; @@ -922,6 +920,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->threadpool = threadpool; +} + void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); @@ -1653,10 +1658,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { return true; } -static enum ggml_status ggml_backend_sched_compute_splits( - ggml_backend_sched_t sched, - ggml_compute_threadpool_t threadpool -) { +static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_backend_sched_split * splits 
= sched->splits; for (int i = 0; i < sched->n_splits; i++) { @@ -1690,7 +1692,7 @@ static enum ggml_status ggml_backend_sched_compute_splits( } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, threadpool); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1712,7 +1714,7 @@ static enum ggml_status ggml_backend_sched_compute_splits( struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, threadpool); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1852,19 +1854,17 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra } enum ggml_status ggml_backend_sched_graph_compute( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph, - ggml_compute_threadpool_t threadpool + ggml_backend_sched_t sched, + struct ggml_cgraph * graph ) { - enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph, threadpool); + enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph); ggml_backend_sched_synchronize(sched); return err; } enum ggml_status ggml_backend_sched_graph_compute_async( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph, - ggml_compute_threadpool_t threadpool + ggml_backend_sched_t sched, + struct ggml_cgraph * graph ) { if (!sched->is_reset && !sched->is_alloc) { ggml_backend_sched_reset(sched); @@ -1876,7 +1876,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async( } } - return ggml_backend_sched_compute_splits(sched, threadpool); + return ggml_backend_sched_compute_splits(sched); } void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { @@ -2115,8 +2115,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); - ggml_backend_graph_compute(backend1, &g1v, NULL); - ggml_backend_graph_compute(backend2, &g2v, NULL); + ggml_backend_graph_compute(backend1, &g1v); + ggml_backend_graph_compute(backend2, &g2v); if (ggml_is_view_op(t1->op)) { continue; diff --git a/ggml-backend.h b/ggml-backend.h index b17d9770b55c9..ba3a898cda29e 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -69,8 +69,7 @@ extern "C" { GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create( ggml_backend_t backend, - const struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + const struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); @@ -79,12 +78,10 @@ extern "C" { ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute( ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async( ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool); + struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); @@ -112,6 +109,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API void 
ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool); GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer @@ -205,8 +203,8 @@ extern "C" { // Allocate and compute graph on the backend scheduler GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); - GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool); - GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool); + GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); // Reset all assignments and allocators - must be called before changing the node backends diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d33f8a49b83bf..b82167cbf7227 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2495,13 +2495,9 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return true; } -GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute( - ggml_backend_t backend, - ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); +GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_cuda_set_device(cuda_ctx->device); #ifdef USE_CUDA_GRAPH diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 90272d5f13d7b..6c6058b2a95b1 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1948,12 +1948,7 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g return ggml_backend_kompute_buffer_type(ctx->device); } -static ggml_status ggml_backend_kompute_graph_compute( - ggml_backend_t backend, - struct ggml_cgraph * cgraph - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); +static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { auto * ctx = static_cast(backend->context); ggml_vk_graph_compute(ctx, cgraph); return GGML_STATUS_SUCCESS; diff --git a/ggml-metal.m b/ggml-metal.m index 051ade2fcc2c5..c9e570dbf5a3a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -3103,12 +3103,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe UNUSED(backend); } -GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute( - ggml_backend_t backend, - struct ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool) { - - UNUSED(threadpool); +GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; return ggml_metal_graph_compute(metal_ctx, cgraph); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 079a718a115a2..e28566a7bdbd7 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -2235,12 +2235,7 @@ static 
ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg GGML_UNUSED(backend); } -static ggml_status ggml_backend_opencl_graph_compute( - ggml_backend_t backend, - ggml_cgraph * graph, - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); +static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node = graph->nodes[i]; diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index f3a4fe8271283..cc1d3ace1ddac 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -585,8 +585,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector & o memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); } -GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, ggml_compute_threadpool * tp) { - UNUSED(tp); +GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; std::vector input; serialize_graph(cgraph, input); @@ -1021,7 +1020,7 @@ bool rpc_server::graph_compute(const std::vector & input, std::vectornodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map); } - ggml_status status = ggml_backend_graph_compute(backend, graph, NULL); + ggml_status status = ggml_backend_graph_compute(backend, graph); // output serialization format: | status (1 byte) | output.resize(1, 0); output[0] = status; diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 15d07dc7a3d1c..496ec61c3c28a 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17022,13 +17022,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -GGML_CALL static ggml_status ggml_backend_sycl_graph_compute( - ggml_backend_t backend, - ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); - +GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_sycl_set_main_device(sycl_ctx->device); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 1f7923d65b800..79ce1479f16ca 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -6225,12 +6225,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; } -GGML_CALL static ggml_status ggml_backend_vk_graph_compute( - ggml_backend_t backend, - ggml_cgraph * cgraph, - ggml_compute_threadpool_t threadpool) { - - GGML_UNUSED(threadpool); +GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl; #endif diff --git a/ggml.c b/ggml.c index 463b40f328c07..5a86c17176787 100644 --- a/ggml.c +++ b/ggml.c @@ -19501,7 +19501,7 @@ static void __cpumask_next(const bool * global_mask, bool * local_mask, bool str int32_t base_idx = *iter; for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) { int32_t idx = base_idx + i; - if (idx > GGML_N_CORES_MAX) { + if (idx >= GGML_N_CORES_MAX) { // Just a cheaper modulo idx -= GGML_N_CORES_MAX; } diff --git a/ggml.h b/ggml.h index 314ea8b55cb85..7020cf28f1d3d 100644 --- a/ggml.h +++ b/ggml.h @@ -2051,7 +2051,7 @@ extern "C" { const struct ggml_cgraph * 
cgraph, int n_threads, struct ggml_compute_threadpool * threadpool); - GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); diff --git a/llama.cpp b/llama.cpp index 69b441020cbd3..1fc26a399a3c9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11366,10 +11366,11 @@ static void llama_graph_compute( if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } - ggml_backend_sched_graph_compute_async(lctx.sched, gf, threadpool); + ggml_backend_sched_graph_compute_async(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } @@ -15428,7 +15429,7 @@ static int llama_apply_lora_from_file_internal( return 1; } - ggml_backend_graph_compute(backend_cpu, gf, nullptr); + ggml_backend_graph_compute(backend_cpu, gf); ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f9d033b52bbfc..de74585da29dd 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -587,7 +587,7 @@ struct test_case { ggml_build_forward_expand(gf, out); // warmup run - ggml_backend_graph_compute(backend, gf, nullptr); + ggml_backend_graph_compute(backend, gf); // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 
1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU @@ -619,7 +619,7 @@ struct test_case { ggml_backend_synchronize(backend); int64_t start_time = ggml_time_us(); - ggml_backend_graph_compute(backend, gf, nullptr); + ggml_backend_graph_compute(backend, gf); ggml_backend_synchronize(backend); int64_t end_time = ggml_time_us(); double time_us = end_time - start_time; From 65c11d415dcfc0d379c8ab14170bb984212995f4 Mon Sep 17 00:00:00 2001 From: fmz Date: Tue, 28 May 2024 22:31:15 -0700 Subject: [PATCH 12/15] llama-bench threadpool CLI params --- common/common.cpp | 10 +- common/common.h | 4 + examples/llama-bench/llama-bench.cpp | 145 +++++++++++++++++++-------- ggml.h | 2 +- 4 files changed, 114 insertions(+), 47 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6306fb572d1c7..1a284c56dd6e7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -218,7 +218,7 @@ void gpt_params_handle_model_default(gpt_params & params) { } } -static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr) { +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { int32_t n_set = 0; if (cpuparams.n_threads < 0) { @@ -226,7 +226,7 @@ static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role if (role_model != nullptr) { cpuparams = *role_model; } else { - cpuparams.n_threads = cpu_get_num_math(); + cpuparams.n_threads = std::thread::hardware_concurrency(); } } @@ -235,11 +235,13 @@ static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role n_set++; } } + if (n_set == 0) { // You hit the jackpot! memset(&cpuparams.cpumask[0], 1, GGML_N_CORES_MAX); n_set = GGML_N_CORES_MAX; } + if (n_set < cpuparams.n_threads) { // Not enough set bits, may experience performance issues. fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); @@ -313,7 +315,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } -static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) { +bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { fprintf(stderr, "Format of CPU range is invalid! 
Expected []-[].\n"); @@ -350,7 +352,7 @@ static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_C return true; } -static bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) { +bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) { // Discard potential 0x prefix size_t start_i = 0; if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { diff --git a/common/common.h b/common/common.h index 9c5ab959fb6fe..039042c815bfe 100644 --- a/common/common.h +++ b/common/common.h @@ -198,6 +198,10 @@ bool gpt_params_parse (int argc, char ** argv, gpt_params & params); bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); +bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_N_CORES_MAX]); +bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_N_CORES_MAX]); +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); + std::string gpt_params_get_system_info(const gpt_params & params); // diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 81b84eb44f8e2..701d8cbcec42e 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -186,11 +186,18 @@ struct cmd_params { std::vector use_mmap; std::vector embeddings; ggml_numa_strategy numa; + cpu_params cpuparams; int reps; bool verbose; output_formats output_format; }; +int32_t n_threads = -1; +bool cpumask[GGML_N_CORES_MAX] = { false }; // CPU affinity mask. +bool mask_valid = false; // Default: any CPU +int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) +bool strict_cpu = false; // Use strict CPU placement +bool poll = false; // Use polling (busywait) to wait for work static const cmd_params cmd_params_defaults = { /* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* n_prompt */ {512}, @@ -210,6 +217,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ {true}, /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, + /* cpuparams */ {int32_t(std::thread::hardware_concurrency()), {false}, false, 1, false, false}, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN @@ -236,6 +244,11 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); + printf(" -mt, --max-threads (default: %d)\n", cmd_params_defaults.cpuparams.n_threads); + printf(" -C, --cpu-mask (default: 0x0)\n"); + printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu); + printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority); + printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -272,7 +285,7 @@ static ggml_type ggml_type_from_name(const std::string & s) { } -static cmd_params parse_cmd_params(int argc, char ** argv) { +static cmd_params parse_cmd_params(int argc, char** argv) { cmd_params params; std::string 
arg; bool invalid_param = false; @@ -292,28 +305,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (arg == "-h" || arg == "--help") { print_usage(argc, argv); exit(0); - } else if (arg == "-m" || arg == "--model") { + } + else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.model.insert(params.model.end(), p.begin(), p.end()); - } else if (arg == "-p" || arg == "--n-prompt") { + } + else if (arg == "-p" || arg == "--n-prompt") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); - } else if (arg == "-n" || arg == "--n-gen") { + } + else if (arg == "-n" || arg == "--n-gen") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); - } else if (arg == "-pg") { + } + else if (arg == "-pg") { if (++i >= argc) { invalid_param = true; break; @@ -323,29 +340,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); - } else if (arg == "-b" || arg == "--batch-size") { + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + } + else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); - } else if (arg == "-ub" || arg == "--ubatch-size") { + } + else if (arg == "-ub" || arg == "--ubatch-size") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); - } else if (arg == "-ctk" || arg == "--cache-type-k") { + } + else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); std::vector types; - for (const auto & t : p) { + for (const auto& t : p) { ggml_type gt = ggml_type_from_name(t); if (gt == GGML_TYPE_COUNT) { invalid_param = true; @@ -354,14 +374,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { types.push_back(gt); } params.type_k.insert(params.type_k.end(), types.begin(), types.end()); - } else if (arg == "-ctv" || arg == "--cache-type-v") { + } + else if (arg == "-ctv" || arg == "--cache-type-v") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); std::vector types; - for (const auto & t : p) { + for (const auto& t : p) { ggml_type gt = ggml_type_from_name(t); if (gt == GGML_TYPE_COUNT) { invalid_param = true; @@ -370,66 +391,104 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { types.push_back(gt); } params.type_v.insert(params.type_v.end(), types.begin(), types.end()); - } else if (arg == "-t" || arg == "--threads") { + } + else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); - } else if (arg == "-ngl" || arg == "--n-gpu-layers") { + } + else if (arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); - } else if (arg == "-sm" || arg == "--split-mode") { + } + else if (arg == "-sm" || arg == 
"--split-mode") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); std::vector modes; - for (const auto & m : p) { + for (const auto& m : p) { llama_split_mode mode; if (m == "none") { mode = LLAMA_SPLIT_MODE_NONE; - } else if (m == "layer") { + } + else if (m == "layer") { mode = LLAMA_SPLIT_MODE_LAYER; - } else if (m == "row") { + } + else if (m == "row") { mode = LLAMA_SPLIT_MODE_ROW; - } else { + } + else { invalid_param = true; break; } modes.push_back(mode); } params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); - } else if (arg == "-mg" || arg == "--main-gpu") { + } + else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { invalid_param = true; break; } params.main_gpu = split(argv[i], split_delim); - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + } + else if (arg == "-nkvo" || arg == "--no-kv-offload") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); - } else if (arg == "--numa") { + } + else if (arg == "--numa") { if (++i >= argc) { invalid_param = true; break; - } else { + } + else { std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } } + + } + else if (arg == "-mt" || arg == "--max-threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.n_threads = std::stoi(argv[i]); + } + else if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string mask = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); + } + else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.priority = std::stoul(argv[i]); + } else if (arg == "--cpu-strict") { + params.cpuparams.strict_cpu = true; + } else if (arg == "--poll") { + params.cpuparams.poll = true; } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { invalid_param = true; @@ -1303,6 +1362,23 @@ int main(int argc, char ** argv) { llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; + postprocess_cpu_params(params.cpuparams); + + struct ggml_threadpool_params tpp; + tpp.n_threads = params.cpuparams.n_threads; + tpp.mask_specified = params.cpuparams.mask_valid; + tpp.strict_cpu = params.cpuparams.strict_cpu; + tpp.prio = params.cpuparams.priority; + tpp.poll = params.cpuparams.poll; + + std::memcpy(&tpp.cpumask[0], ¶ms.cpuparams.cpumask[0], GGML_N_CORES_MAX); + + struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + for (const auto & inst : params_instances) { // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1329,21 +1405,6 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); - 
struct ggml_threadpool_params tpp; - tpp.n_threads = t.n_threads; - - // TODO: expose these via cli opts - tpp.mask_specified = false; - tpp.strict_cpu = false; - tpp.prio = 1; - tpp.poll = false; - - struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); - if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); - } - llama_set_n_threads(ctx, t.n_threads, t.n_threads); llama_attach_threadpool(ctx, threadpool); @@ -1378,8 +1439,8 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_release_threadpool(threadpool); } + ggml_release_threadpool(threadpool); llama_free_model(lmodel); diff --git a/ggml.h b/ggml.h index 7020cf28f1d3d..5c63d825e32a6 100644 --- a/ggml.h +++ b/ggml.h @@ -274,7 +274,7 @@ #define GGML_UNREACHABLE() ((void) 0) #endif -#define GGML_N_CORES_MAX 512 +#define GGML_N_CORES_MAX 16 // used to copy the number of elements and stride in bytes of tensors into local variables. // main purpose is to reduce code duplication and improve readability. From e8c336438792056145f32a7b26ff239cf40ebbe5 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Thu, 30 May 2024 07:58:02 -0400 Subject: [PATCH 13/15] fixes for non-llvm builds --- examples/llama-bench/llama-bench.cpp | 17 ++++++++++------- ggml.c | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 701d8cbcec42e..82a001a0af3ba 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -191,13 +191,16 @@ struct cmd_params { bool verbose; output_formats output_format; }; +// +//static const cpu_params default_cpuparams( +// int32_t(std::thread::hardware_concurrency()), +// {false}, +// false, +// 1, +// false, +// false +//); -int32_t n_threads = -1; -bool cpumask[GGML_N_CORES_MAX] = { false }; // CPU affinity mask. 
-bool mask_valid = false; // Default: any CPU -int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) -bool strict_cpu = false; // Use strict CPU placement -bool poll = false; // Use polling (busywait) to wait for work static const cmd_params cmd_params_defaults = { /* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* n_prompt */ {512}, @@ -217,7 +220,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ {true}, /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* cpuparams */ {int32_t(std::thread::hardware_concurrency()), {false}, false, 1, false, false}, + /* cpuparams */ {}, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN diff --git a/ggml.c b/ggml.c index 5a86c17176787..ff7a8ca1a5eb4 100644 --- a/ggml.c +++ b/ggml.c @@ -20062,7 +20062,7 @@ struct ggml_cplan ggml_graph_plan( } cplan.threadpool = threadpool; - cplan.n_threads = n_threads; + cplan.n_threads = MIN(max_tasks, n_threads); cplan.work_size = work_size; cplan.work_data = NULL; From 719d12bc51a3e508af7b839230e1e03198666b5f Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Fri, 31 May 2024 02:57:56 -0400 Subject: [PATCH 14/15] fix server test --- llama.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 1fc26a399a3c9..180d1153d0320 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11512,14 +11512,11 @@ static int llama_decode_internal( if (lctx.threadpool_batch) { ggml_resume_threadpool(lctx.threadpool_batch); threadpool = lctx.threadpool_batch; - n_threads = cparams.n_threads_batch; } else if (lctx.threadpool) { ggml_resume_threadpool(lctx.threadpool); threadpool = lctx.threadpool; - n_threads = cparams.n_threads; - } else { - n_threads = cparams.n_threads_batch; } + n_threads = cparams.n_threads_batch; } GGML_ASSERT(n_threads > 0); From 9d6675def40473561bff4c2c2fb78df9d09be856 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Fri, 31 May 2024 12:30:12 -0400 Subject: [PATCH 15/15] fix n_threads == 1 bug --- ggml.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index ff7a8ca1a5eb4..201f019587c75 100644 --- a/ggml.c +++ b/ggml.c @@ -20130,7 +20130,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl int compute_status = GGML_STATUS_SUCCESS; // The main-thread is a work thread too. Start computing... - atomic_fetch_sub(&threadpool->n_ready, 1); + if (n_threads > 1) { + atomic_fetch_sub(&threadpool->n_ready, 1); + } compute_status = (size_t) ggml_graph_compute_thread(&threadpool->workers[0]); // don't leave affinity set on the main thread
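// A sketch of how the CPU-mask helpers exported in common.h by PATCH 12 above behave;
// it assumes those declarations and GGML_N_CORES_MAX from ggml.h. parse_cpu_mask()
// reads a hex string (optional "0x" prefix) and parse_cpu_range() reads an inclusive
// "start-end" range; both OR the selected bits into a bool mask of GGML_N_CORES_MAX entries.
bool cpumask[GGML_N_CORES_MAX] = { false };
bool ok = parse_cpu_mask("0xFF", cpumask);    // hex form: sets cores 0..7
ok = ok && parse_cpu_range("0-7", cpumask);   // range form: also sets cores 0..7
// llama-bench feeds the same values in through the new -C/--cpu-mask option,
// with --cpu-strict and --poll controlling placement and wait behavior.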