
Commit f77260c

Merge branch 'ggerganov:master' into master
2 parents: 7e51fe0 + 152610e


119 files changed: +15130 additions, -1347 deletions


.devops/nix/package.nix

Lines changed: 2 additions & 1 deletion
@@ -31,6 +31,7 @@
   # Increases the runtime closure size by ~700M
   useMpi ? false,
   useRocm ? config.rocmSupport,
+  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
   enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
   ]
   ++ optionals useRocm [
     (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-    (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+    (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
   ]
   ++ optionals useMetalKit [
     (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")

.devops/tools.sh

Lines changed: 5 additions & 5 deletions
@@ -8,23 +8,23 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert_hf_to_gguf.py "$@"
+    exec python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./llama-quantize "$@"
+    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./llama-cli "$@"
+    exec ./llama-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
         if [ -f "${i/f16/q4_0}" ]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./llama-server "$@"
+    exec ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "

.github/workflows/build.yml

Lines changed: 32 additions & 2 deletions
@@ -317,7 +317,7 @@ jobs:
           wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
           sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
           sudo apt-get update -y
-          sudo apt-get install -y build-essential vulkan-sdk
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
 
       - name: Build
         id: cmake_build
@@ -327,6 +327,12 @@
           cmake -DGGML_VULKAN=ON ..
           cmake --build . --config Release -j $(nproc)
 
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
     container: rocm/dev-ubuntu-22.04:6.0.2
@@ -662,6 +668,8 @@
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'llvm-arm64-opencl-adreno'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
     steps:
       - name: Clone
@@ -703,6 +711,28 @@
         run: |
           choco install ninja
 
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          mkdir build && cd build
+          cmake .. `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          mkdir build-arm64-release && cd build-arm64-release
+          cmake .. `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install --config release
+
       - name: Build
         id: cmake_build
         run: |
@@ -732,7 +762,7 @@
       - name: Test
         id: cmake_test
         # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900

Makefile

Lines changed: 9 additions & 0 deletions
@@ -22,6 +22,7 @@ BUILD_TARGETS = \
     llama-infill \
     llama-llava-cli \
     llama-minicpmv-cli\
+    llama-qwen2vl-cli\
     llama-lookahead \
     llama-lookup \
     llama-lookup-create \
@@ -1404,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
     $(OBJ_ALL)
     $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+    examples/llava/llava.cpp \
+    examples/llava/llava.h \
+    examples/llava/clip.cpp \
+    examples/llava/clip.h \
+    $(OBJ_ALL)
+    $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
     (cd examples/batched.swift; make build)

README.md

Lines changed: 18 additions & 2 deletions
@@ -98,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
+- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 
 #### Multimodal
 
@@ -110,6 +111,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
 
 </details>
 
@@ -219,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
 | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
-| [hipBLAS](docs/build.md#hipblas) | AMD GPU |
+| [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 
@@ -412,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 [^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
 
-## [`llama-bench`](example/bench)
+## [`llama-bench`](examples/llama-bench)
 
 #### Benchmark the performance of the inference for various parameters.
 
@@ -433,6 +435,20 @@ To learn more about model quantization, [read this documentation](examples/quant
 
 </details>
 
+## [`llama-run`](examples/run)
+
+#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
+
+- <details>
+  <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
+
+  ```bash
+  llama-run granite-code
+  ```
+
+  </details>
+
+[^3]: [https://github.com/containers/ramalama](RamaLama)
 
 ## [`llama-simple`](examples/simple)
 

common/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
     find_package(CURL REQUIRED)
-    add_definitions(-DLLAMA_USE_CURL)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
     find_library(CURL_LIBRARY curl REQUIRED)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})

common/arg.cpp

Lines changed: 6 additions & 7 deletions
@@ -855,13 +855,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.ignore_eos = true;
         }
     ).set_sparam());
-    add_opt(common_arg(
-        {"--penalize-nl"},
-        string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
-        [](common_params & params) {
-            params.sampling.penalize_nl = true;
-        }
-    ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
         string_format("temperature (default: %.1f)", (double)params.sampling.temp),
@@ -916,6 +909,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--repeat-last-n"}, "N",
         string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
         [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
+            }
             params.sampling.penalty_last_n = value;
             params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
         }
@@ -970,6 +966,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--dry-penalty-last-n"}, "N",
         string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
         [](common_params & params, int value) {
+            if (value < -1) {
+                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+            }
             params.sampling.dry_penalty_last_n = value;
         }
     ).set_sparam());
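With the new bound checks, `--repeat-last-n` and `--dry-penalty-last-n` now reject values below -1 at argument-parse time instead of passing them through to the sampler (-1 still means "use the context size", 0 disables the window). A minimal standalone sketch of the same validation idea; the helper name and the plain `std::to_string` formatting are illustrative stand-ins, not the repository's `string_format`:

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative stand-in for the option callback: -1 = "use context size",
// 0 = disabled, anything below -1 is rejected outright.
static int parse_penalty_last_n(const std::string & opt, int value) {
    if (value < -1) {
        throw std::runtime_error("error: invalid " + opt + " = " + std::to_string(value));
    }
    return value;
}

int main() {
    std::cout << parse_penalty_last_n("repeat-last-n", 64) << "\n"; // accepted
    try {
        parse_penalty_last_n("repeat-last-n", -5);                  // now rejected
    } catch (const std::runtime_error & e) {
        std::cerr << e.what() << "\n";
    }
    return 0;
}
```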

common/common.cpp

Lines changed: 22 additions & 7 deletions
@@ -940,6 +940,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
@@ -1076,12 +1095,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-
-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
 static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
@@ -1767,7 +1780,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
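The `ignore_eos` path above works by appending a `-INFINITY` logit bias for every token the model marks as end-of-generation, so sampling can never terminate on its own; the `-1` sentinel for the penalty windows is likewise resolved to the context size at init time. A simplified, self-contained sketch of the logit-bias idea using plain standard-library types instead of the llama.cpp API (the `is_eog` predicate and the token ids here are hypothetical):

```cpp
#include <cstdio>
#include <limits>
#include <utility>
#include <vector>

// Hypothetical end-of-generation predicate; in llama.cpp this question is
// answered by the model vocabulary (llama_token_is_eog).
static bool is_eog(int token) { return token == 2 || token == 7; }

int main() {
    const int n_vocab = 16; // toy vocabulary size
    std::vector<std::pair<int, float>> logit_bias;

    // Mirror of the change: bias every EOG token to -infinity so the sampler
    // can never pick it while --ignore-eos is active.
    for (int i = 0; i < n_vocab; i++) {
        if (is_eog(i)) {
            logit_bias.push_back({i, -std::numeric_limits<float>::infinity()});
        }
    }

    for (const auto & entry : logit_bias) {
        std::printf("token %d biased to %f\n", entry.first, entry.second);
    }
    return 0;
}
```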

common/common.h

Lines changed: 19 additions & 10 deletions
@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
 
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;
 
 struct common_control_vector_load_info;
 
@@ -95,6 +95,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC = 8,
     COMMON_SAMPLER_TYPE_INFILL = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -130,7 +131,6 @@ struct common_params_sampling {
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
-    bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos = false;
     bool no_perf = false; // disable performance metrics
     bool timing_per_token = false;
@@ -139,6 +139,7 @@
 
 
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -193,11 +194,13 @@
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
     // offload params
-    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -437,6 +440,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
     return parts;
 }
 
+static bool string_starts_with(const std::string & str,
+                               const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
@@ -588,7 +596,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
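The `starts_with` helper that used to be a private static in common.cpp is now exposed as `string_starts_with` in common.h. A quick standalone illustration of the `rfind(prefix, 0)` idiom it relies on, pending C++20's `std::string::starts_with` (the sample strings are arbitrary):

```cpp
#include <cassert>
#include <string>

// Same idiom as the header's string_starts_with: rfind with pos = 0 can only
// match when the prefix sits at the very start of the string.
static bool string_starts_with(const std::string & str, const std::string & prefix) {
    return str.rfind(prefix, 0) == 0;
}

int main() {
    assert(string_starts_with("--repeat-last-n", "--"));
    assert(!string_starts_with("model.gguf", "--"));
    return 0;
}
```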