Commit 9dd7e77

Merge branch 'master' into xsn/llama_batch_remove_compat
2 parents 4be7ecf + afd9909


50 files changed: +8914 additions, -4616 deletions

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
     set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
+if (NOT DEFINED GGML_AMX)
+    set(GGML_AMX ON)
+endif()
+
 if (NOT DEFINED GGML_CUDA_GRAPHS)
     set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()

Makefile

Lines changed: 19 additions & 5 deletions
@@ -93,11 +93,6 @@ GGML_METAL := 1
 DEPRECATE_WARNING := 1
 endif
 
-ifdef LLAMA_OPENMP
-GGML_OPENMP := 1
-DEPRECATE_WARNING := 1
-endif
-
 ifdef LLAMA_RPC
 GGML_RPC := 1
 DEPRECATE_WARNING := 1
@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
 	OBJ_GGML += ggml/src/llamafile/sgemm.o
 endif
 
+ifndef GGML_NO_AMX
+	MK_CPPFLAGS += -DGGML_USE_AMX
+	OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+endif
+
 ifdef GGML_RPC
 	MK_CPPFLAGS += -DGGML_USE_RPC
 	OBJ_GGML += ggml/src/ggml-rpc.o
@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
 
+ifndef GGML_NO_AMX
+ggml/src/ggml-amx.o: \
+	ggml/src/ggml-amx.cpp \
+	ggml/include/ggml-amx.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ggml/src/ggml-amx/mmq.o: \
+	ggml/src/ggml-amx/mmq.cpp \
+	ggml/src/ggml-amx/mmq.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 ifdef GGML_RPC
 ggml/src/ggml-rpc.o: \
 	ggml/src/ggml-rpc.cpp \
@@ -1238,6 +1251,7 @@ clean:
 	rm -vrf ggml/src/ggml-metal-embed.metal
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

README.md

Lines changed: 3 additions & 1 deletion
@@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2 and AVX512 support for x86 architectures
+- AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support
@@ -130,6 +130,8 @@ Typically finetunes of the base models below are supported as well.
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
+- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
+- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 
 **UI:**

common/arg.cpp

Lines changed: 21 additions & 0 deletions
@@ -947,6 +947,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sparams.tfs_z = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--xtc-probability"}, "N",
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
+        [](common_params & params, const std::string & value) {
+            params.sparams.xtc_probability = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--xtc-threshold"}, "N",
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
+        [](common_params & params, const std::string & value) {
+            params.sparams.xtc_threshold = std::stof(value);
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
         string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
@@ -1788,6 +1802,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+    add_opt(common_arg(
+        {"--cache-reuse"}, "N",
+        string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
+        [](common_params & params, int value) {
+            params.n_cache_reuse = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

common/common.cpp

Lines changed: 2 additions & 0 deletions
@@ -2104,6 +2104,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+    fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
     fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
     fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");

common/common.h

Lines changed: 9 additions & 2 deletions
@@ -90,6 +90,8 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TFS_Z       = 4,
     COMMON_SAMPLER_TYPE_TYPICAL_P   = 5,
     COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
+    COMMON_SAMPLER_TYPE_XTC         = 7,
+    COMMON_SAMPLER_TYPE_INFILL      = 8,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -108,6 +110,8 @@ struct common_sampler_params {
     int32_t top_k           = 40;    // <= 0 to use vocab size
     float   top_p           = 0.95f; // 1.0 = disabled
     float   min_p           = 0.05f; // 0.0 = disabled
+    float   xtc_probability = 0.00f; // 0.0 = disabled
+    float   xtc_threshold   = 0.10f; // > 0.5 disables XTC
     float   tfs_z           = 1.00f; // 1.0 = disabled
     float   typ_p           = 1.00f; // typical_p, 1.0 = disabled
     float   temp            = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
@@ -124,13 +128,15 @@ struct common_sampler_params {
     bool ignore_eos = false;
     bool no_perf    = false; // disable performance metrics
 
+
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TFS_Z,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
         COMMON_SAMPLER_TYPE_MIN_P,
-        COMMON_SAMPLER_TYPE_TEMPERATURE
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
     std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -277,7 +283,8 @@ struct common_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
 
     std::string hostname    = "127.0.0.1";
     std::string public_path = ""; // NOLINT

common/json-schema-to-grammar.cpp

Lines changed: 1 addition & 1 deletion
@@ -611,7 +611,7 @@ class SchemaConverter {
             }
             return join_seq();
         };
-        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
     }
 
     /*

common/sampling.cpp

Lines changed: 17 additions & 3 deletions
@@ -130,10 +130,10 @@ std::string common_sampler_params::print() const {
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
-            top_k, tfs_z, top_p, min_p, typ_p, temp,
+            top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
             mirostat, mirostat_eta, mirostat_tau);
 
     return std::string(result);
@@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             case COMMON_SAMPLER_TYPE_MIN_P:
                 llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
                 break;
+            case COMMON_SAMPLER_TYPE_XTC:
+                llama_sampler_chain_add(result->chain, llama_sampler_init_xtc      (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                break;
             case COMMON_SAMPLER_TYPE_TFS_Z:
                 llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
                 break;
@@ -193,6 +196,9 @@
             case COMMON_SAMPLER_TYPE_TEMPERATURE:
                 llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                 break;
+            case COMMON_SAMPLER_TYPE_INFILL:
+                llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
+                break;
             default:
                 GGML_ASSERT(false && "unknown sampler type");
         }
@@ -372,6 +378,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_XTC:         return 'x';
+        case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
         default : return '?';
     }
 }
@@ -384,6 +392,8 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
+        case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
         default : return "";
     }
 }
@@ -396,6 +406,8 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
         { "tfs_z",       COMMON_SAMPLER_TYPE_TFS_Z },
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+        { "infill",      COMMON_SAMPLER_TYPE_INFILL },
     };
 
     // since samplers names are written multiple ways
@@ -441,7 +453,9 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
     };
 
     std::vector<common_sampler_type> samplers;
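
For reference, each `common_sampler_init` case above translates one-to-one into a `llama_sampler_chain_add` call. Below is a rough, hedged sketch of the chain that the README's recommended `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5` would resolve to, built directly against the C API. `llama_sampler_chain_init`, `llama_sampler_chain_default_params` and `llama_sampler_init_dist` are assumed from llama.h (they are not shown in this commit), and the final `dist` step stands in for whatever terminal sampler the caller actually configures.

```cpp
#include <cstdint>

#include "llama.h"

// Sketch: a sampler chain roughly equivalent to
// `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`,
// i.e. min_p first, then XTC, then a final probabilistic pick.
static struct llama_sampler * make_min_p_xtc_chain(uint32_t seed) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    const size_t min_keep = 1;

    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.02f, min_keep));              // 'm'
    llama_sampler_chain_add(chain, llama_sampler_init_xtc  (0.50f, 0.10f, min_keep, seed)); // 'x'
    llama_sampler_chain_add(chain, llama_sampler_init_dist (seed));                         // draw the token

    return chain;
}
```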

examples/json_schema_to_grammar.py

Lines changed: 1 addition & 1 deletion
@@ -540,7 +540,7 @@ def join_seq():
         return self._add_rule(
             name,
             to_rule(transform()) if self._raw_pattern \
-                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
+                else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
 
 
     def _resolve_ref(self, ref):

examples/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 1 deletion
@@ -151,7 +151,7 @@ static std::string get_gpu_info() {
         int count = ggml_backend_sycl_get_device_count();
         for (int i = 0; i < count; i++) {
             char buf[128];
-            ggml_sycl_get_device_description(i, buf, sizeof(buf));
+            ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
             id += buf;
             if (i < count - 1) {
                 id += "/";

examples/llava/llava.cpp

Lines changed: 1 addition & 1 deletion
@@ -466,7 +466,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-        LOG_ERR("%s: coulnd't embed the image\n", __func__);
+        LOG_ERR("%s: couldn't embed the image\n", __func__);
         return NULL;
     }

examples/main/README.md

Lines changed: 13 additions & 4 deletions
@@ -241,6 +241,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres
 
 Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
 
+### XTC Sampling
+
+- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
+- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
+
+Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
+
+By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
+
+Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.
+
+Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`
+
 ### Logit Bias
 
 - `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
@@ -284,10 +297,6 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
 
-### Memory Float 32
-
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
-
 ### Batch Size
 
 - `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
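
The XTC description added in the README hunk above amounts to a single filtering pass over the candidate distribution. Below is a minimal, illustrative C++ sketch of that pass, operating on (token, probability) pairs already sorted by descending probability. The names `Candidate` and `xtc_apply` are hypothetical, chosen for illustration only; they are not part of the API this commit adds.

```cpp
#include <cstdint>
#include <random>
#include <vector>

struct Candidate {
    int32_t token;
    float   prob;   // softmax probability; candidates are sorted descending by prob
};

// Illustrative sketch of the XTC step described above: with probability `p`,
// drop every candidate whose probability is at or above `threshold`, except the
// least probable of them, so only the weakest of the "top choices" survives.
static void xtc_apply(std::vector<Candidate> & cands, float p, float threshold,
                      size_t min_keep, std::mt19937 & rng) {
    std::uniform_real_distribution<float> coin(0.0f, 1.0f);
    if (p <= 0.0f || threshold > 0.5f || coin(rng) >= p) {
        return; // sampler disabled or not triggered on this step
    }

    // index of the last (least probable) candidate still at or above the threshold
    size_t last = 0;
    for (size_t i = 0; i < cands.size(); ++i) {
        if (cands[i].prob >= threshold) {
            last = i;
        }
    }

    // removal only makes sense if at least two candidates are above the threshold,
    // and the surviving list must not shrink below min_keep
    if (last >= 1 && cands.size() - last >= min_keep) {
        cands.erase(cands.begin(), cands.begin() + last);
    }
}
```

With the README's suggested settings (`--xtc-probability 0.5 --xtc-threshold 0.1`), roughly half of the sampling steps prune every candidate at or above 10% probability except the weakest of them, which is what breaks up the most predictable continuations while keeping the output coherent.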

examples/main/main.cpp

Lines changed: 17 additions & 17 deletions
@@ -569,30 +569,30 @@ int main(int argc, char ** argv) {
             if (!params.ctx_shift){
                 LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
                 break;
-            } else {
-                if (params.n_predict == -2) {
-                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-                    break;
-                }
+            }
+
+            if (params.n_predict == -2) {
+                LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                break;
+            }
 
-                const int n_left    = n_past - params.n_keep;
-                const int n_discard = n_left/2;
+            const int n_left    = n_past - params.n_keep;
+            const int n_discard = n_left/2;
 
-                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                        n_past, n_left, n_ctx, params.n_keep, n_discard);
+            LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                    n_past, n_left, n_ctx, params.n_keep, n_discard);
 
-                llama_kv_cache_seq_rm (ctx, 0, params.n_keep, params.n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past + 1, -n_discard);
+            llama_kv_cache_seq_rm (ctx, 0, params.n_keep, params.n_keep + n_discard);
+            llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
 
-                n_past -= n_discard;
+            n_past -= n_discard;
 
-                LOG_DBG("after swap: n_past = %d\n", n_past);
+            LOG_DBG("after swap: n_past = %d\n", n_past);
 
-                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+            LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
 
-                LOG_DBG("clear session path\n");
-                path_session.clear();
-            }
+            LOG_DBG("clear session path\n");
+            path_session.clear();
         }
     } else {
         // context extension via Self-Extend
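
The context-shift block above keeps the first `n_keep` tokens, discards half of the remainder, and slides the surviving KV cells back so decoding can continue (the hunk also tightens the shift's upper bound from `n_past + 1` to `n_past`). A small worked example of the arithmetic, with made-up values:

```cpp
#include <cstdio>

// Worked example of the context-shift arithmetic shown in the hunk above.
// The values are illustrative, not taken from the commit.
int main() {
    int n_past = 2048;       // tokens currently in the KV cache
    const int n_keep = 256;  // prompt prefix that must survive

    const int n_left    = n_past - n_keep; // 1792 tokens eligible for eviction
    const int n_discard = n_left / 2;      // 896 tokens actually dropped

    // conceptually: llama_kv_cache_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
    //               llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);
    n_past -= n_discard;

    printf("kept [0,%d), removed [%d,%d), shifted the rest back by %d -> n_past = %d\n",
           n_keep, n_keep, n_keep + n_discard, n_discard, n_past);
    return 0;
}
```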
