From da53236bf3953509b716feac8458ad1d5a90b212 Mon Sep 17 00:00:00 2001
From: JohannesGaessler
Date: Tue, 8 Aug 2023 19:05:10 +0200
Subject: [PATCH 1/3] database writing works

---
 Makefile               |  2 +-
 examples/main/main.cpp | 37 +++++++++++++++++++++++++++++++++++++
 llama.cpp              |  9 +++++++++
 llama.h                |  2 ++
 4 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 897c5cb9abcca..8772107ea123e 100644
--- a/Makefile
+++ b/Makefile
@@ -47,7 +47,7 @@ OPT = -O3
 endif
 CFLAGS = -I. $(OPT) -std=c11 -fPIC
 CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
-LDFLAGS =
+LDFLAGS = -lsqlite3
 
 ifdef LLAMA_DEBUG
 CFLAGS += -O0 -g
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 56ada7e69d99d..fb4c828bc7ee9 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -17,8 +17,10 @@
 #include <ctime>
 #include <fstream>
 #include <iostream>
+#include <sstream>
 #include <string>
 #include <vector>
+#include <sqlite3.h>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -163,6 +165,30 @@ int main(int argc, char ** argv) {
         return 0;
     }
 
+    sqlite3 * db = NULL;
+    int return_code;
+    const int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE;
+    return_code = sqlite3_open_v2("llama.sqlite", &db, flags, NULL);
+    fprintf(stderr, "\nsqlite open: %d %s\n\n", return_code, sqlite3_errmsg(db));
+
+    const std::string sql_create_table ="CREATE TABLE IF NOT EXISTS llama_runs("
+        "id INTEGER PRIMARY KEY AUTOINCREMENT,"
+        "build_number INTEGER NOT NULL,"
+        "build_commit TEXT NOT NULL,"
+
+        "n_gpu_layers BIGINT NOT NULL,"
+
+        "t_sample_us BIGINT NOT NULL,"
+        "t_eval_us BIGINT NOT NULL,"
+        "t_p_eval_us BIGINT NOT NULL,"
+        "n_sample BIGINT NOT NULL,"
+        "n_eval BIGINT NOT NULL,"
+        "n_p_eval BIGINT NOT NULL);";
+
+    char * errmsg;
+    return_code = sqlite3_exec(db, sql_create_table.c_str(), NULL, NULL, &errmsg);
+    fprintf(stderr, "\nsqlite create table: %d %s\n\n", return_code, errmsg);
+
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
 
@@ -808,6 +834,17 @@ int main(int argc, char ** argv) {
     }
 
     llama_print_timings(ctx);
+
+    std::ostringstream sql_insert_values;
+    sql_insert_values << "INSERT INTO llama_runs(build_number, build_commit, n_gpu_layers, "
+        "t_sample_us, t_eval_us, t_p_eval_us, n_sample, n_eval, n_p_eval) VALUES (";
+    sql_insert_values << BUILD_NUMBER << ",";
+    sql_insert_values << "'" << BUILD_COMMIT << "',";
+    sql_insert_values << params.n_gpu_layers << ",";
+    llama_sqlite_append_timings(ctx, sql_insert_values);
+    return_code = sqlite3_exec(db, sql_insert_values.str().c_str(), NULL, NULL, &errmsg);
+    fprintf(stderr, "\nsqlite insert data: %d %s\n\n", return_code, errmsg);
+
     if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
     llama_free_model(model);
diff --git a/llama.cpp b/llama.cpp
index 71061aab910ef..a9256121b7070 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4243,6 +4243,15 @@ void llama_print_timings(struct llama_context * ctx) {
     fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
+void llama_sqlite_append_timings(struct llama_context * ctx, std::ostringstream & sql_insert_values) {
+    sql_insert_values << ctx->t_sample_us << ",";
+    sql_insert_values << ctx->t_eval_us << ",";
+    sql_insert_values << ctx->t_p_eval_us << ",";
+    sql_insert_values << ctx->n_sample << ",";
+    sql_insert_values << ctx->n_eval << ",";
+    sql_insert_values << ctx->n_p_eval << ");";
+}
+
 void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_start_us = ggml_time_us();
     ctx->t_sample_us = ctx->n_sample = 0;
diff --git a/llama.h b/llama.h
index fa1977f2d9492..8585942efc91c 100644
--- a/llama.h
+++ b/llama.h
@@ -8,6 +8,7 @@
 #else
 #define LLAMA_MAX_DEVICES 1
 #endif // GGML_USE_CUBLAS
+#include <sstream>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -446,6 +447,7 @@ extern "C" {
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
+    LLAMA_API void llama_sqlite_append_timings(struct llama_context * ctx, std::ostringstream & sql_insert_values);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
     // Print system information

From 97c809448ff106ca8084a8775f14e2f8af71f325 Mon Sep 17 00:00:00 2001
From: JohannesGaessler
Date: Tue, 8 Aug 2023 19:34:31 +0200
Subject: [PATCH 2/3] add plotting files

---
 benchmark.sh       |  6 ++++++
 plot_ts_per_ngl.py | 23 +++++++++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100755 benchmark.sh
 create mode 100644 plot_ts_per_ngl.py

diff --git a/benchmark.sh b/benchmark.sh
new file mode 100755
index 0000000000000..67d628698ed25
--- /dev/null
+++ b/benchmark.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+for ngl in {0..35}
+do
+    ./main --model models/nvme/llama-7b-ggml-q4_0.bin --seed 1337 --ignore-eos --n-predict 128 --ctx-size 2048 --threads 8 -ngl $ngl -mmq
+done
diff --git a/plot_ts_per_ngl.py b/plot_ts_per_ngl.py
new file mode 100644
index 0000000000000..b922cee7bbcf9
--- /dev/null
+++ b/plot_ts_per_ngl.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+import sqlite3
+import matplotlib.pyplot as plt
+
+con = sqlite3.connect("llama.sqlite")
+cur = con.cursor()
+
+ts = []
+
+for ngl in range(0, 36):
+    res = cur.execute(f"SELECT t_eval_us,n_eval FROM llama_runs WHERE n_gpu_layers={ngl};")
+    t_eval_us, n_eval = res.fetchone()
+    ts.append(n_eval * 1000000/t_eval_us)
+
+plt.plot(ts)
+plt.xlim(0, 35)
+plt.ylim(0, 130)
+plt.title("7b q4_0, 3700X, 3200 MHz dual-channel RAM, RTX 3090")
+plt.xlabel("-ngl")
+plt.ylabel("Generated t/s")
+plt.savefig("benchmark.png", dpi=240)
+plt.show()

From 829565b13dbb59411e22c89a5de50636a26c701c Mon Sep 17 00:00:00 2001
From: JohannesGaessler
Date: Wed, 9 Aug 2023 01:31:26 +0200
Subject: [PATCH 3/3] better SQL

---
 plot_ts_per_ngl.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/plot_ts_per_ngl.py b/plot_ts_per_ngl.py
index b922cee7bbcf9..de056b0ed256c 100644
--- a/plot_ts_per_ngl.py
+++ b/plot_ts_per_ngl.py
@@ -1,19 +1,16 @@
 #!/usr/bin/env python3
 
 import sqlite3
+import numpy as np
 import matplotlib.pyplot as plt
 
 con = sqlite3.connect("llama.sqlite")
 cur = con.cursor()
 
-ts = []
+res = cur.execute("SELECT n_gpu_layers, 1000000.0*n_eval/t_eval_us FROM llama_runs ORDER BY n_gpu_layers;")
+ts = np.array(res.fetchall())
 
-for ngl in range(0, 36):
-    res = cur.execute(f"SELECT t_eval_us,n_eval FROM llama_runs WHERE n_gpu_layers={ngl};")
-    t_eval_us, n_eval = res.fetchone()
-    ts.append(n_eval * 1000000/t_eval_us)
-
-plt.plot(ts)
+plt.plot(ts[:, 0], ts[:, 1])
 plt.xlim(0, 35)
 plt.ylim(0, 130)
 plt.title("7b q4_0, 3700X, 3200 MHz dual-channel RAM, RTX 3090")