
Commit 80724eb

Merge branch 'master' into server-oai-compat
2 parents f25308b + b35f3d0 commit 80724eb

10 files changed: +285 -11 lines changed


README.md

Lines changed: 3 additions & 2 deletions
@@ -10,7 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- *No hot topics atm. Open to suggestions about what is hot today*
+- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
 
 ----
 
@@ -422,8 +422,9 @@ Building the program with BLAS support may lead to some performance improvements
     CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
     cmake --build .
     ```
-  - Using `CMake` for Windows:
+  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
     ```bash
+    set PATH=%HIP_PATH%\bin;%PATH%
     mkdir build
     cd build
     cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..

common/common.cpp

Lines changed: 79 additions & 0 deletions
@@ -12,6 +12,7 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include <cinttypes>
@@ -495,6 +496,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.chatml = true;
         } else if (arg == "--infill") {
             params.infill = true;
+        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+            params.dump_kv_cache = true;
         } else if (arg == "--multiline-input") {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
@@ -835,6 +838,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf("  --verbose-prompt      print prompt before generation\n");
+    printf("  -dkvc, --dump-kv-cache\n");
+    printf("                        verbose print of the KV cache\n");
     printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@@ -1386,3 +1391,77 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
+
+//
+// KV cache utils
+//
+
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        int seq_count = 0;
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] >= 0) { seq_count++; }
+        }
+        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
+    }
+
+    printf("\n=== Done dumping\n");
+}
+
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    std::unordered_map<llama_seq_id, size_t> seqs;
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] < 0) { continue; }
+            if (seqs.find(cs_curr[j]) == seqs.end()) {
+                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+                seqs[cs_curr[j]] = seqs.size();
+            }
+        }
+        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+    }
+
+    printf("=== Sequence legend: ");
+    for (const auto & it : seqs) {
+        printf("%zu=%d, ", it.second, it.first);
+    }
+    printf("'+'=other sequence ids");
+
+    c_curr = view.cells;
+    cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] >= 0) {
+                const auto & it = seqs.find(cs_curr[j]);
+                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
+            } else {
+                putchar('.');
+            }
+        }
+        putchar(' ');
+    }
+
+    printf("\n=== Done dumping\n");
+}

common/common.h

Lines changed: 11 additions & 0 deletions
@@ -122,6 +122,7 @@ struct gpt_params {
     bool numa           = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt = false; // print prompt tokens before generation
     bool infill         = false; // use infill mode
+    bool dump_kv_cache  = false; // dump the KV cache contents for debugging purposes
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
@@ -218,3 +219,13 @@ std::string get_sortable_timestamp();
 void dump_non_result_info_yaml(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
+//
+// KV cache utils
+//
+
+// Dump the KV cache view with the number of sequences per cell.
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+
+// Dump the KV cache view showing individual sequences in each cell (long output).
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
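Taken together, these declarations are driven by the KV-cache view API from llama.h, as the examples/parallel/parallel.cpp change further below shows. A minimal usage sketch, assuming an initialized `llama_context * ctx`; the wrapper name `debug_dump_kv` is hypothetical, and `llama_kv_cache_view_free` is assumed here to be the cleanup counterpart of `llama_kv_cache_view_init`:

```cpp
#include "common.h"
#include "llama.h"

// Hypothetical helper: allocate a view, refresh it from the current cache
// state, print both dump formats, then release the view.
static void debug_dump_kv(llama_context * ctx, int n_max_seq) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, n_max_seq);

    llama_kv_cache_view_update(ctx, &view);  // snapshot the current cache state
    dump_kv_cache_view(view, 80);            // compact: one character per cell
    dump_kv_cache_view_seqs(view, 40);       // verbose: one character per sequence slot

    llama_kv_cache_view_free(&view);         // assumed cleanup counterpart of _init
}
```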

docs/llama-star/idea-arch.key

477 KB
Binary file not shown.

docs/llama-star/idea-arch.pdf

41.3 KB
Binary file not shown.

examples/parallel/parallel.cpp

Lines changed: 10 additions & 1 deletion
@@ -1,5 +1,5 @@
 // A basic application simulating a server with multiple clients.
-// The clients submite requests to the server and they are processed in parallel.
+// The clients submit requests to the server and they are processed in parallel.
 
 #include "common.h"
 #include "llama.h"
@@ -113,6 +113,8 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;
 
+    const bool dump_kv_cache = params.dump_kv_cache;
+
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("parallel", "log"));
     LOG_TEE("Log start\n");
@@ -172,6 +174,8 @@ int main(int argc, char ** argv) {
     int32_t n_total_gen  = 0;
     int32_t n_cache_miss = 0;
 
+    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
+
     const auto t_main_start = ggml_time_us();
 
     LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
@@ -201,6 +205,11 @@ int main(int argc, char ** argv) {
     LOG_TEE("Processing requests ...\n\n");
 
     while (true) {
+        if (dump_kv_cache) {
+            llama_kv_cache_view_update(ctx, &kvc_view);
+            dump_kv_cache_view_seqs(kvc_view, 40);
+        }
+
         llama_batch_clear(batch);
 
         // decode any currently ongoing sequences

examples/server/server.cpp

Lines changed: 2 additions & 0 deletions
@@ -1112,6 +1112,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_results);
         task_result res;
         res.id = id;
+        res.stop = false;
         res.error = true;
         res.result_json = { { "content", error } };
         queue_results.push_back(res);
@@ -1284,6 +1285,7 @@ struct llama_server_context
         std::lock_guard<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
+        task.target_id = 0;
         task.data = std::move(data);
         task.infill_mode = infill;
         task.embedding_mode = embedding;

ggml-cuda.cu

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -8057,7 +8058,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
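For reference, the `PRId64` idiom used in the new line: `<cinttypes>` defines format-specifier macros so that `int64_t` values such as `ne[3]` print portably, whereas a plain `%d` only matches `int`. A standalone sketch (not llama.cpp code, just an illustration of the pattern):

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
    int64_t ne3 = 1;
    // PRId64 expands to the correct conversion specifier for int64_t on the
    // target platform ("ld", "lld", ...), so string-literal concatenation
    // produces a valid format string on every compiler.
    std::printf("ne[3] = %" PRId64 "\n", ne3);
    return 0;
}
```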
