Commit c882647

Direct I/O and Transparent HugePages
--direct-io for bypassing the page cache (and using Transparent HugePages on Linux). Up to 3-6x faster uncached loading, fewer pageouts, no page cache pollution.
1 parent a876861 commit c882647
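The mechanism behind the flag, roughly: open the model file with O_DIRECT so reads bypass the page cache, read into a buffer aligned for direct I/O, and on Linux hint the kernel to back that buffer with Transparent HugePages. The sketch below illustrates the idea only; it is not the loader code changed by this commit, and the function name load_direct, the 2 MiB alignment choice, and the error handling are illustrative assumptions.

    // Illustrative only: direct I/O read into a huge-page-friendly buffer (Linux).
    // Names and constants here are assumptions, not the commit's implementation.
    #ifndef _GNU_SOURCE
    #define _GNU_SOURCE            // for O_DIRECT
    #endif
    #include <cstdlib>             // posix_memalign, free
    #include <fcntl.h>             // open, O_DIRECT
    #include <sys/mman.h>          // madvise, MADV_HUGEPAGE
    #include <sys/stat.h>          // fstat
    #include <unistd.h>            // pread, close

    static void * load_direct(const char * path, size_t * out_size) {
        int fd = open(path, O_RDONLY | O_DIRECT);          // bypass the page cache
        if (fd < 0) { return nullptr; }

        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return nullptr; }

        // O_DIRECT needs aligned buffer/offset/length; 2 MiB also matches the
        // transparent huge page size on x86-64.
        const size_t align  = 2u * 1024u * 1024u;
        const size_t padded = (((size_t) st.st_size) + align - 1) / align * align;

        void * buf = nullptr;
        if (posix_memalign(&buf, align, padded) != 0) { close(fd); return nullptr; }
        madvise(buf, padded, MADV_HUGEPAGE);               // ask the kernel for THP backing

        size_t off = 0;
        while (off < (size_t) st.st_size) {
            ssize_t n = pread(fd, (char *) buf + off, padded - off, (off_t) off);
            if (n <= 0) { free(buf); close(fd); return nullptr; }
            off += (size_t) n;
        }
        close(fd);
        *out_size = (size_t) st.st_size;
        return buf;                                        // caller releases with free()
    }

The trade-off versus the default mmap path is that the data ends up in anonymous (possibly huge-page-backed) memory owned by the process rather than in the shared page cache, so repeated loads do not get the cache's warm-start benefit.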

File tree: 9 files changed, +366 -56 lines changed

common/common.cpp

Lines changed: 13 additions & 0 deletions

@@ -1638,6 +1638,17 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
             params.use_mmap = false;
         }
     ));
+    add_opt(llama_arg(
+        {"--direct-io"},
+        "use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution)",
+        [](gpt_params & params) {
+            if (llama_supports_direct_io()) {
+                params.use_direct_io = true;
+            } else {
+                fprintf(stderr, "warning: direct I/O is not available, --direct-io option will be ignored\n");
+            }
+        }
+    ));
     add_opt(llama_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -2742,6 +2753,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
+    mparams.use_direct_io = params.use_direct_io;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
@@ -3780,6 +3792,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "direct-io: %s # default: false\n", params.use_direct_io ? "true" : "false");
     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -208,6 +208,7 @@ struct gpt_params {
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
+    bool use_direct_io = false; // use direct I/O
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation

examples/llama-bench/README.md

Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@ options:
   -nkvo, --no-kv-offload <0|1> (default: 0)
   -fa, --flash-attn <0|1> (default: 0)
   -mmp, --mmap <0|1> (default: 1)
+  -dio, --direct-io <0|1> (default: 0)
   --numa <distribute|isolate|numactl> (default: disabled)
   -embd, --embeddings <0|1> (default: 0)
   -ts, --tensor-split <ts0/ts1/..> (default: 0)

examples/llama-bench/llama-bench.cpp

Lines changed: 29 additions & 3 deletions

@@ -243,6 +243,7 @@ struct cmd_params {
     std::vector<bool> flash_attn;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
+    std::vector<bool> use_direct_io;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
     int reps;
@@ -275,6 +276,7 @@ static const cmd_params cmd_params_defaults = {
     /* flash_attn */ {false},
     /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap */ {true},
+    /* use_direct_io */ {false},
     /* embeddings */ {false},
     /* numa */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps */ 5,
@@ -312,6 +314,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
     printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
     printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
@@ -559,6 +562,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
         }
         auto p = string_split<bool>(argv[i], split_delim);
         params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+    } else if (arg == "-dio" || arg == "--direct-io") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        auto p = string_split<bool>(argv[i], split_delim);
+        params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
     } else if (arg == "-embd" || arg == "--embeddings") {
         if (++i >= argc) {
             invalid_param = true;
@@ -650,6 +660,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
+    if (params.use_direct_io.empty()){ params.use_direct_io = cmd_params_defaults.use_direct_io; }
     if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
     if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
     if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
@@ -679,6 +690,7 @@ struct cmd_params_instance {
     bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
+    bool use_direct_io;
     bool embeddings;

     llama_model_params to_llama_mparams() const {
@@ -692,6 +704,7 @@ struct cmd_params_instance {
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap = use_mmap;
+        mparams.use_direct_io = use_direct_io;

         return mparams;
     }
@@ -703,6 +716,7 @@ struct cmd_params_instance {
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu &&
                use_mmap == other.use_mmap &&
+               use_direct_io == other.use_direct_io &&
                tensor_split == other.tensor_split;
     }

@@ -733,6 +747,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
     for (const auto & mmp : params.use_mmap)
+    for (const auto & dio : params.use_direct_io)
     for (const auto & embd : params.embeddings)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
@@ -768,6 +783,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         /* .flash_attn = */ fa,
         /* .tensor_split = */ ts,
         /* .use_mmap = */ mmp,
+        /* .use_direct_io= */ dio,
         /* .embeddings = */ embd,
     };
     instances.push_back(instance);
@@ -797,6 +813,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         /* .flash_attn = */ fa,
         /* .tensor_split = */ ts,
         /* .use_mmap = */ mmp,
+        /* .use_direct_io= */ dio,
         /* .embeddings = */ embd,
     };
     instances.push_back(instance);
@@ -826,6 +843,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         /* .flash_attn = */ fa,
         /* .tensor_split = */ ts,
         /* .use_mmap = */ mmp,
+        /* .use_direct_io= */ dio,
         /* .embeddings = */ embd,
     };
     instances.push_back(instance);
@@ -867,6 +885,7 @@ struct test {
     bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
+    bool use_direct_io;
     bool embeddings;
     int n_prompt;
     int n_gen;
@@ -896,6 +915,7 @@ struct test {
         flash_attn = inst.flash_attn;
         tensor_split = inst.tensor_split;
         use_mmap = inst.use_mmap;
+        use_direct_io = inst.use_direct_io;
         embeddings = inst.embeddings;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
@@ -967,7 +987,7 @@ struct test {
         "type_k", "type_v",
         "n_gpu_layers", "split_mode",
         "main_gpu", "no_kv_offload", "flash_attn",
-        "tensor_split", "use_mmap", "embeddings",
+        "tensor_split", "use_mmap", "use_direct_io", "embeddings",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
         "avg_ts", "stddev_ts",
@@ -989,7 +1009,7 @@ struct test {
         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
             field == "cpu_strict" ||
-            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+            field == "flash_attn" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -1025,7 +1045,7 @@ struct test {
             ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
-            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
+            tensor_split_str, std::to_string(use_mmap), std::to_string(use_direct_io), std::to_string(embeddings),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -1241,6 +1261,9 @@ struct markdown_printer : public printer {
         if (field == "use_mmap") {
             return "mmap";
         }
+        if (field == "use_direct_io") {
+            return "direct_io";
+        }
         if (field == "embeddings") {
             return "embd";
         }
@@ -1302,6 +1325,9 @@ struct markdown_printer : public printer {
         if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
             fields.emplace_back("use_mmap");
         }
+        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
+            fields.emplace_back("use_direct_io");
+        }
         if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
             fields.emplace_back("embeddings");
         }

examples/main/README.md

Lines changed: 4 additions & 0 deletions

@@ -274,6 +274,10 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
 
+### Direct I/O
+
+- `--direct-io`: Use direct I/O. Potentially faster uncached loading, fewer pageouts, no page cache pollution. You may benefit from this option if you load a model for the first time (or after some time), load several different models consecutively, or simply want to keep the page cache clean. The faster your storage device is, the greater the gain you can expect. The effect may be greater on Linux due to Transparent HugePage support.
+
 ### NUMA support
 
 - `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
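For contrast with the `--no-mmap` and `--direct-io` descriptions in the README hunk above, here is a minimal sketch of what a memory-mapped load looks like in general. It is not llama.cpp's actual implementation, and map_model is a hypothetical name.

    // Illustrative only: a generic mmap-style load path, for contrast with direct I/O.
    #include <fcntl.h>     // open
    #include <sys/mman.h>  // mmap
    #include <sys/stat.h>  // fstat
    #include <unistd.h>    // close

    static void * map_model(const char * path, size_t * out_size) {
        int fd = open(path, O_RDONLY);
        if (fd < 0) { return nullptr; }

        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return nullptr; }

        // Pages are faulted in lazily and live in the shared page cache:
        // repeat loads are fast, but a large model can evict other cached data.
        void * addr = mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        close(fd);                                  // the mapping keeps the file referenced
        if (addr == MAP_FAILED) { return nullptr; }

        *out_size = (size_t) st.st_size;
        return addr;                                // caller unmaps with munmap()
    }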

examples/server/README.md

Lines changed: 1 addition & 0 deletions

@@ -97,6 +97,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
+| `--direct-io` | use direct I/O (potentially faster uncached loading, fewer pageouts, no page cache pollution) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
 | `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |

include/llama.h

Lines changed: 2 additions & 0 deletions

@@ -303,6 +303,7 @@ extern "C" {
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;    // only load the vocabulary, no weights
         bool use_mmap;      // use mmap if possible
+        bool use_direct_io; // use direct I/O if possible
         bool use_mlock;     // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
     };
@@ -429,6 +430,7 @@ extern "C" {
     LLAMA_API size_t llama_max_devices(void);
 
     LLAMA_API bool llama_supports_mmap      (void);
+    LLAMA_API bool llama_supports_direct_io (void);
     LLAMA_API bool llama_supports_mlock     (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
 
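A minimal caller-side sketch against the API extended above. It assumes the existing llama.h entry points llama_model_default_params, llama_load_model_from_file, and llama_free_model, which are not part of this diff; the flag simply follows whatever llama_supports_direct_io() reports for the build.

    // Illustrative caller: enable direct I/O only when the build/platform supports it.
    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }

        llama_model_params mparams = llama_model_default_params();
        mparams.use_direct_io = llama_supports_direct_io();   // false where unsupported

        llama_model * model = llama_load_model_from_file(argv[1], mparams);
        if (model == nullptr) {
            fprintf(stderr, "failed to load model: %s\n", argv[1]);
            return 1;
        }

        // ... create a context and run inference as usual ...

        llama_free_model(model);
        return 0;
    }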
scripts/run-with-preset.py

Lines changed: 2 additions & 2 deletions

@@ -12,7 +12,7 @@
 
 CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
     "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
-    "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
+    "direct-io", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
     "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
     "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
     "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
@@ -30,7 +30,7 @@
 ]
 
 CLI_ARGS_LLAMA_SERVER = [
-    "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
+    "alias", "batch-size", "ctx-size", "direct-io", "embedding", "host", "memory-f32", "lora", "lora-base",
     "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
     "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
     "threads", "verbose"
