Skip to content

Commit d01bccd

Browse files
authored
ci : integrate with ggml-org/ci (#2250)
* ci : run ctest ggml-ci * ci : add open llama 3B-v2 tests ggml-ci * ci : disable wget progress output ggml-ci * ci : add open llama 3B-v2 tg tests for q4 and q5 quantizations ggml-ci * tests : try to fix tail free sampling test ggml-ci * ci : add K-quants ggml-ci * ci : add short perplexity tests ggml-ci * ci : add README.md * ppl : add --chunks argument to limit max number of chunks ggml-ci * ci : update README
1 parent 6cbf9df commit d01bccd

File tree

8 files changed

+312
-6
lines changed

8 files changed

+312
-6
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ build/
1616
build-em/
1717
build-debug/
1818
build-release/
19+
build-ci-debug/
20+
build-ci-release/
1921
build-static/
2022
build-cublas/
2123
build-opencl/
@@ -25,9 +27,10 @@ build-no-accel/
2527
build-sanitize-addr/
2628
build-sanitize-thread/
2729
out/
30+
tmp/
2831

2932
models/*
30-
*.bin
33+
models-mnt
3134

3235
/main
3336
/quantize

ci/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# CI
2+
3+
In addition to [GitHub Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:
4+
5+
https://github.com/ggml-org/ci
6+
7+
It monitors the `master` branch for new commits and runs the
8+
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
9+
to execute heavier workloads than GitHub Actions alone allows. Over time, the cloud instances will be scaled
10+
to cover various hardware architectures, including GPU and Apple Silicon instances.
11+
12+
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
13+
Only the branches of this repo are monitored for this keyword.
14+
15+
It is good practice to execute the full CI locally on your machine before publishing changes:
16+
17+
```bash
18+
mkdir tmp
19+
bash ./ci/run.sh ./tmp/results ./tmp/mnt
20+
```

ci/run.sh

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
#!/bin/bash
2+
3+
if [ -z "$2" ]; then
4+
echo "usage: $0 <output-dir> <mnt-dir>"
5+
exit 1
6+
fi
7+
8+
mkdir -p "$1"
9+
mkdir -p "$2"
10+
11+
OUT=$(realpath "$1")
12+
MNT=$(realpath "$2")
13+
14+
rm -v $OUT/*.log
15+
rm -v $OUT/*.exit
16+
rm -v $OUT/*.md
17+
18+
sd=`dirname $0`
19+
cd $sd/../
20+
SRC=`pwd`
21+
22+
## helpers
23+
24+
# download a file if it does not exist or if it is outdated
25+
function gg_wget {
26+
local out=$1
27+
local url=$2
28+
29+
local cwd=`pwd`
30+
31+
mkdir -p $out
32+
cd $out
33+
34+
# should not re-download if file is the same
35+
wget -nv -N $url
36+
37+
cd $cwd
38+
}
39+
40+
function gg_printf {
41+
printf -- "$@" >> $OUT/README.md
42+
}
43+
44+
function gg_run {
45+
ci=$1
46+
47+
set -o pipefail
48+
set -x
49+
50+
gg_run_$ci | tee $OUT/$ci.log
51+
cur=$?
52+
echo "$cur" > $OUT/$ci.exit
53+
54+
set +x
55+
set +o pipefail
56+
57+
gg_sum_$ci
58+
59+
ret=$((ret | cur))
60+
}
61+
62+
## ci
63+
64+
# ctest_debug
65+
66+
function gg_run_ctest_debug {
67+
cd ${SRC}
68+
69+
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
70+
71+
set -e
72+
73+
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
74+
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
75+
76+
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
77+
78+
set +e
79+
}
80+
81+
function gg_sum_ctest_debug {
82+
gg_printf '### %s\n\n' "${ci}"
83+
84+
gg_printf 'Runs ctest in debug mode\n'
85+
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
86+
gg_printf '```\n'
87+
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
88+
gg_printf '```\n'
89+
gg_printf '\n'
90+
}
91+
92+
# ctest_release
93+
94+
function gg_run_ctest_release {
95+
cd ${SRC}
96+
97+
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
98+
99+
set -e
100+
101+
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
102+
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
103+
104+
if [ -z $GG_BUILD_LOW_PERF ]; then
105+
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
106+
else
107+
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
108+
fi
109+
110+
set +e
111+
}
112+
113+
function gg_sum_ctest_release {
114+
gg_printf '### %s\n\n' "${ci}"
115+
116+
gg_printf 'Runs ctest in release mode\n'
117+
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
118+
gg_printf '```\n'
119+
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
120+
gg_printf '```\n'
121+
}
122+
123+
# open_llama_3b_v2
124+
125+
function gg_run_open_llama_3b_v2 {
126+
cd ${SRC}
127+
128+
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
129+
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
130+
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
131+
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
132+
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
133+
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
134+
135+
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
136+
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
137+
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
138+
139+
path_models="../models-mnt/open-llama/3B-v2"
140+
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
141+
142+
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
143+
144+
set -e
145+
146+
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
147+
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
148+
149+
python3 ../convert.py ${path_models}
150+
151+
model_f16="${path_models}/ggml-model-f16.bin"
152+
model_q8_0="${path_models}/ggml-model-q8_0.bin"
153+
model_q4_0="${path_models}/ggml-model-q4_0.bin"
154+
model_q4_1="${path_models}/ggml-model-q4_1.bin"
155+
model_q5_0="${path_models}/ggml-model-q5_0.bin"
156+
model_q5_1="${path_models}/ggml-model-q5_1.bin"
157+
model_q3_k="${path_models}/ggml-model-q3_k.bin"
158+
model_q4_k="${path_models}/ggml-model-q4_k.bin"
159+
model_q5_k="${path_models}/ggml-model-q5_k.bin"
160+
model_q6_k="${path_models}/ggml-model-q6_k.bin"
161+
162+
wiki_test_60="${path_wiki}/wiki.test-60.raw"
163+
164+
./bin/quantize ${model_f16} ${model_q8_0} q8_0
165+
./bin/quantize ${model_f16} ${model_q4_0} q4_0
166+
./bin/quantize ${model_f16} ${model_q4_1} q4_1
167+
./bin/quantize ${model_f16} ${model_q5_0} q5_0
168+
./bin/quantize ${model_f16} ${model_q5_1} q5_1
169+
./bin/quantize ${model_f16} ${model_q3_k} q3_k
170+
./bin/quantize ${model_f16} ${model_q4_k} q4_k
171+
./bin/quantize ${model_f16} ${model_q5_k} q5_k
172+
./bin/quantize ${model_f16} ${model_q6_k} q6_k
173+
174+
(time ./bin/main --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
175+
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
176+
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
177+
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
178+
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
179+
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
180+
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
181+
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
182+
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
183+
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
184+
185+
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
186+
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
187+
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
188+
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
189+
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
190+
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
191+
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
192+
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
193+
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
194+
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
195+
196+
function check_ppl {
197+
qnt="$1"
198+
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
199+
200+
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
201+
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
202+
return 20
203+
fi
204+
205+
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
206+
return 0
207+
}
208+
209+
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
210+
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
211+
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
212+
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
213+
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
214+
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
215+
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
216+
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
217+
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
218+
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
219+
220+
set +e
221+
}
222+
223+
function gg_sum_open_llama_3b_v2 {
224+
gg_printf '### %s\n\n' "${ci}"
225+
226+
gg_printf 'OpenLLaMA 3B-v2:\n'
227+
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
228+
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
229+
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
230+
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
231+
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
232+
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
233+
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
234+
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
235+
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
236+
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
237+
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
238+
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
239+
}
240+
241+
## main
242+
243+
if [ -z $GG_BUILD_LOW_PERF ]; then
244+
rm -rf ${SRC}/models-mnt
245+
246+
mnt_models=$(realpath ${MNT}/models)
247+
mkdir -p ${mnt_models}
248+
ln -sfn ${mnt_models} ${SRC}/models-mnt
249+
250+
python3 -m pip install -r ${SRC}/requirements.txt
251+
fi
252+
253+
ret=0
254+
255+
#test $ret -eq 0 && gg_run ctest_debug
256+
#test $ret -eq 0 && gg_run ctest_release
257+
258+
if [ -z $GG_BUILD_LOW_PERF ]; then
259+
test $ret -eq 0 && gg_run open_llama_3b_v2
260+
fi
261+
262+
exit $ret

examples/common.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
279279
break;
280280
}
281281
params.n_keep = std::stoi(argv[i]);
282+
} else if (arg == "--chunks") {
283+
if (++i >= argc) {
284+
invalid_param = true;
285+
break;
286+
}
287+
params.n_chunks = std::stoi(argv[i]);
282288
} else if (arg == "-m" || arg == "--model") {
283289
if (++i >= argc) {
284290
invalid_param = true;
@@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
515521
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
516522
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
517523
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
524+
fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
518525
if (llama_mlock_supported()) {
519526
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
520527
}

examples/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ struct gpt_params {
2828
int32_t n_ctx = 512; // context size
2929
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
3030
int32_t n_keep = 0; // number of tokens to keep from initial prompt
31+
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
3132
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
3233
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
3334
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs

examples/perplexity/perplexity.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
3232
// BOS tokens will be added for each chunk before eval
3333
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
3434

35-
int count = 0;
35+
const int n_chunk_max = tokens.size() / params.n_ctx;
3636

37-
const int n_chunk = tokens.size() / params.n_ctx;
37+
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
3838
const int n_vocab = llama_n_vocab(ctx);
3939
const int n_batch = params.n_batch;
4040

41+
int count = 0;
4142
double nll = 0.0;
43+
4244
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
4345

4446
for (int i = 0; i < n_chunk; ++i) {

llama.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2024,9 +2024,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
20242024
}
20252025

20262026
// Normalize the second derivatives
2027-
float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
2028-
for (float & value : second_derivatives) {
2029-
value /= second_derivatives_sum;
2027+
{
2028+
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
2029+
2030+
if (second_derivatives_sum > 1e-6f) {
2031+
for (float & value : second_derivatives) {
2032+
value /= second_derivatives_sum;
2033+
}
2034+
} else {
2035+
for (float & value : second_derivatives) {
2036+
value = 1.0f / second_derivatives.size();
2037+
}
2038+
}
20302039
}
20312040

20322041
float cum_sum = 0.0f;

tests/test-sampling.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,4 +200,6 @@ int main(void) {
200200
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
201201

202202
printf("OK\n");
203+
204+
return 0;
203205
}

0 commit comments

Comments
 (0)