
Commit 9b38f8b

Merge branch 'master' into compilade/refactor-kv-cache
2 parents: 10c3c41 + d7fd29f

102 files changed: +4818, −1748 lines


.devops/nix/package.nix

Lines changed: 7 additions & 10 deletions
@@ -17,19 +17,18 @@
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
-  clblast,
+  curl,
   useBlas ? builtins.all (x: !x) [
     useCuda
     useMetalKit
-    useOpenCL
     useRocm
     useVulkan
   ] && blas.meta.available,
   useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
   useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
   useRocm ? config.rocmSupport,
+  enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@@ -56,7 +55,6 @@ let
     ++ lib.optionals useCuda [ "CUDA" ]
     ++ lib.optionals useMetalKit [ "MetalKit" ]
     ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useOpenCL [ "OpenCL" ]
     ++ lib.optionals useRocm [ "ROCm" ]
     ++ lib.optionals useVulkan [ "Vulkan" ];

@@ -198,19 +196,19 @@ effectiveStdenv.mkDerivation (
       optionals effectiveStdenv.isDarwin darwinBuildInputs
       ++ optionals useCuda cudaBuildInputs
       ++ optionals useMpi [ mpi ]
-      ++ optionals useOpenCL [ clblast ]
       ++ optionals useRocm rocmBuildInputs
       ++ optionals useBlas [ blas ]
-      ++ optionals useVulkan vulkanBuildInputs;
+      ++ optionals useVulkan vulkanBuildInputs
+      ++ optionals enableCurl [ curl ];

       cmakeFlags =
         [
           (cmakeBool "LLAMA_BUILD_SERVER" true)
           (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
           (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+          (cmakeBool "LLAMA_CURL" enableCurl)
           (cmakeBool "GGML_NATIVE" false)
           (cmakeBool "GGML_BLAS" useBlas)
-          (cmakeBool "GGML_CLBLAST" useOpenCL)
           (cmakeBool "GGML_CUDA" useCuda)
           (cmakeBool "GGML_HIPBLAS" useRocm)
           (cmakeBool "GGML_METAL" useMetalKit)

@@ -254,7 +252,6 @@ effectiveStdenv.mkDerivation (
         useCuda
         useMetalKit
         useMpi
-        useOpenCL
         useRocm
         useVulkan
         ;

@@ -281,7 +278,7 @@ effectiveStdenv.mkDerivation (
       # Configurations we don't want even the CI to evaluate. Results in the
       # "unsupported platform" messages. This is mostly a no-op, because
       # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
+      badPlatforms = optionals useCuda lib.platforms.darwin;

       # Configurations that are known to result in build failures. Can be
       # overridden by importing Nixpkgs with `allowBroken = true`.

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 0 additions & 2 deletions
@@ -9,5 +9,3 @@ contact_links:
   - name: Want to contribute?
     url: https://github.com/ggerganov/llama.cpp/wiki/contribute
     about: Head to the contribution guide page of the wiki for areas you can help with
-
-

.gitignore

Lines changed: 6 additions & 5 deletions
@@ -98,13 +98,14 @@ examples/server/*.mjs.hpp
 
 # Python
 
-__pycache__
-.venv
-/Pipfile
-dist
-poetry.lock
+/.venv
+__pycache__/
+*/poetry.lock
 poetry.toml
 
+# Nix
+/result
+
 # Test binaries
 /tests/test-backend-ops
 /tests/test-double-float

CMakeLists.txt

Lines changed: 5 additions & 1 deletion
@@ -42,6 +42,10 @@ endif()
 
 option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
 
+if (WIN32)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
 #
 # option list
 #

@@ -152,7 +156,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
 
 install(
-    FILES convert-hf-to-gguf.py
+    FILES convert_hf_to_gguf.py
     PERMISSIONS
         OWNER_READ
         OWNER_WRITE

CMakePresets.json

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@
     "cacheVariables": {
       "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
       "CMAKE_CXX_COMPILER": "icx",
+      "CMAKE_C_COMPILER": "cl",
       "GGML_SYCL": "ON",
       "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
     }

Makefile

Lines changed: 6 additions & 0 deletions
@@ -62,6 +62,11 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-bpe \
 	tests/test-tokenizer-1-spm
 
+# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
+LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
+
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
 $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)

@@ -1086,6 +1091,7 @@ clean:
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
+	rm -rvf $(LEGACY_TARGETS)
 	find examples pocs -type f -name "*.o" -delete
 
 #

README.md

Lines changed: 6 additions & 0 deletions
@@ -108,6 +108,7 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)

@@ -217,6 +218,11 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 **Tools:**
 
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
+
+**Infrastructure:**
+
+- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 
 ---

ci/run.sh

Lines changed: 1 addition & 1 deletion
@@ -688,7 +688,7 @@ function gg_run_embd_bge_small {
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
 
-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
 
     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"

common/common.cpp

Lines changed: 19 additions & 2 deletions
@@ -757,7 +757,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cache_type_v = argv[++i];
         return true;
     }
-    if (arg == "--multiline-input") {
+    if (arg == "-mli" || arg == "--multiline-input") {
         params.multiline_input = true;
         return true;
     }

@@ -2070,7 +2070,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (params.warmup) {
         LOG("warming up the model with an empty run\n");
 
-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_token_bos(model);
+        llama_token eos = llama_token_eos(model);
+        // some models (e.g. T5) don't have a BOS token
+        if (bos != -1) {
+            tmp.push_back(bos);
+        }
+        tmp.push_back(eos);
+
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == -1) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear();
+            tmp.push_back(decoder_start_token_id);
+        }
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_past_clear(lctx);
         llama_synchronize(lctx);

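Read as a whole, the warmup path after this change looks roughly like the sketch below. It is assembled from the hunk above rather than copied verbatim from the file; `model`, `lctx`, and `params` are assumed to be the locals visible at that point in llama_init_from_gpt_params. The idea: models without a BOS token (e.g. T5) warm up on EOS alone, and encoder-decoder models first run llama_encode, then decode from the decoder-start token, falling back to BOS.

// Consolidated sketch of the new warmup logic (assumes `model`, `lctx`,
// `params` as in llama_init_from_gpt_params above).
if (params.warmup) {
    LOG("warming up the model with an empty run\n");

    std::vector<llama_token> tmp;
    llama_token bos = llama_token_bos(model);
    llama_token eos = llama_token_eos(model);
    // some models (e.g. T5) don't have a BOS token
    if (bos != -1) {
        tmp.push_back(bos);
    }
    tmp.push_back(eos);

    if (llama_model_has_encoder(model)) {
        // encoder-decoder: run the encoder once, then seed the decoder with
        // its start token (falling back to BOS if none is defined)
        llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
        if (decoder_start_token_id == -1) {
            decoder_start_token_id = bos;
        }
        tmp.clear();
        tmp.push_back(decoder_start_token_id);
    }
    llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
    llama_past_clear(lctx);
    llama_synchronize(lctx);
}
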
common/common.h

Lines changed: 0 additions & 1 deletion
@@ -459,4 +459,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
