
Commit cbedf6c

ggerganov authored and infil00p committed
llama : remove llama_kv_cache_view API + remove deprecated (ggml-org#13653)
ggml-ci
1 parent 016e5f4 · commit cbedf6c

File tree

10 files changed: +1 −390 lines

common/arg.cpp

Lines changed: 1 addition & 8 deletions
@@ -1452,7 +1452,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.swa_full = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2065,13 +2065,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",

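Note on the first hunk: binding set_env("LLAMA_ARG_SWA_FULL") lets --swa-full be enabled through the environment as well as the command line. Below is a minimal C++ sketch of that fallback pattern; flag_from_env_or_cli is a hypothetical helper (not part of the codebase), and the "1"/"true" check is an assumption about how value-less flags are read from the environment.

#include <cstdlib>
#include <cstring>

// Hypothetical helper mirroring the env fallback of a boolean common_arg
// bound with set_env(): the flag turns on when the CLI switch is passed,
// or when the environment variable is set to a truthy value ("1"/"true"
// is an assumption, not a guarantee of common_arg's exact parsing).
static bool flag_from_env_or_cli(bool cli_value, const char * env_name) {
    if (cli_value) {
        return true;
    }
    const char * v = std::getenv(env_name);
    return v != nullptr && (std::strcmp(v, "1") == 0 || std::strcmp(v, "true") == 0);
}

// e.g. bool swa_full = flag_from_env_or_cli(cli_swa_full, "LLAMA_ARG_SWA_FULL");
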
common/common.cpp

Lines changed: 0 additions & 75 deletions
@@ -1329,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }
 
-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
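
With the dump helpers removed, the coarse introspection left by this commit is the pair of debug counters kept in llama.h (see the include/llama.h hunk below). A minimal sketch of a one-line replacement summary, assuming an initialized llama_context * ctx:

#include <cstdio>
#include "llama.h"

// Rough stand-in for the removed dump helpers, using only the two
// counters this commit keeps (both documented as slow / debug-only).
static void print_kv_cache_summary(const struct llama_context * ctx) {
    const int32_t n_tokens   = llama_kv_self_n_tokens(ctx);   // tokens; a multi-sequence cell counts once per sequence
    const int32_t used_cells = llama_kv_self_used_cells(ctx); // cells with at least one sequence assigned
    printf("KV cache: %d tokens across %d used cells\n", n_tokens, used_cells);
}

This gives totals only; the removed view API's per-cell detail (positions and sequence ids per cell) has no public replacement in this commit.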

common/common.h

Lines changed: 0 additions & 11 deletions
@@ -330,7 +330,6 @@ struct common_params {
     bool use_mlock      = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true;  // print prompt before generation
-    bool dump_kv_cache  = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload  = false; // disable KV offloading
     bool warmup         = true;  // warmup run
     bool check_tensors  = false; // validate tensor data
@@ -622,16 +621,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //

examples/lookahead/lookahead.cpp

Lines changed: 0 additions & 13 deletions
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
     const int N = 5; // n-gram size
     const int G = 15; // max verification n-grams
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
     // here we keep adding new n-grams as we go
     ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
     const auto t_dec_start = ggml_time_us();
 
     // sample first token
@@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
     }
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
         //
         // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {
 
     common_sampler_free(smpl);
 
-    llama_kv_cache_view_free(&kvc_view);
-
     llama_batch_free(batch);
 
     llama_backend_free();

examples/lookup/lookup.cpp

Lines changed: 0 additions & 11 deletions
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
     // max. number of additional tokens to draft if match is found
     const int n_draft = params.speculative.n_max;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -110,18 +108,9 @@ int main(int argc, char ** argv){
 
     llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
     const auto t_dec_start = ggml_time_us();
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // print current draft sequence
         LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());

examples/parallel/parallel.cpp

Lines changed: 0 additions & 9 deletions
@@ -178,8 +178,6 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // is the system prompt shared in the cache
     const bool is_sp_shared = params.is_pp_shared;
 
@@ -241,8 +239,6 @@ int main(int argc, char ** argv) {
     int32_t n_total_gen  = 0;
    int32_t n_cache_miss = 0;
 
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
     const auto t_main_start = ggml_time_us();
 
     LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -272,11 +268,6 @@ int main(int argc, char ** argv) {
     LOG_INF("Processing requests ...\n\n");
 
     while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         common_batch_clear(batch);
 
         // decode any currently ongoing sequences

include/llama.h

Lines changed: 0 additions & 114 deletions
@@ -608,72 +608,13 @@ extern "C" {
     // KV cache
     //
 
-    // TODO: start using struct llama_kv_cache
-
-    // Information associated with an individual cell in the KV cache view.
-    struct llama_kv_cache_view_cell {
-        // The position for this cell. Takes KV cache shifts into account.
-        // May be negative if the cell is not populated.
-        llama_pos pos;
-    };
-
-    // An updateable view of the KV cache.
-    struct llama_kv_cache_view {
-        // Number of KV cache cells. This will be the same as the context size.
-        int32_t n_cells;
-
-        // Maximum number of sequences that can exist in a cell. It's not an error
-        // if there are more sequences in a cell than this value, however they will
-        // not be visible in the view cells_sequences.
-        int32_t n_seq_max;
-
-        // Number of tokens in the cache. For example, if there are two populated
-        // cells, the first with 1 sequence id in it and the second with 2 sequence
-        // ids then you'll have 3 tokens.
-        int32_t token_count;
-
-        // Number of populated cache cells.
-        int32_t used_cells;
-
-        // Maximum contiguous empty slots in the cache.
-        int32_t max_contiguous;
-
-        // Index to the start of the max_contiguous slot range. Can be negative
-        // when cache is full.
-        int32_t max_contiguous_idx;
-
-        // Information for an individual cell.
-        struct llama_kv_cache_view_cell * cells;
-
-        // The sequences for each cell. There will be n_seq_max items per cell.
-        llama_seq_id * cells_sequences;
-    };
-
-    // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-
-    // Free a KV cache view. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
-    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
-    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
-    ///
-
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
 
-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "use llama_kv_self_n_tokens instead");
-
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
     LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
 
-    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-            "use llama_kv_self_used_cells instead");
-
     // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_self_clear(
             struct llama_context * ctx);
@@ -756,61 +697,6 @@ extern "C" {
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
     LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
 
-    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx),
-        "use llama_kv_self_clear instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1),
-        "use llama_kv_self_seq_rm instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
-            struct llama_context * ctx,
-            llama_seq_id seq_id_src,
-            llama_seq_id seq_id_dst,
-            llama_pos p0,
-            llama_pos p1),
-        "use llama_kv_self_seq_cp instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "use llama_kv_self_seq_keep instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            llama_pos delta),
-        "use llama_kv_self_seq_add instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_pos p0,
-            llama_pos p1,
-            int d),
-        "use llama_kv_self_seq_div instead");
-
-    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
-            struct llama_context * ctx,
-            llama_seq_id seq_id),
-        "use llama_kv_self_seq_pos_max instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
-        "use llama_kv_self_defrag instead");
-
-    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
-        "use llama_kv_self_can_shift instead");
-
-    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
-        "use llama_kv_self_update instead");
-
-
     //
     // State / sessions
     //
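
Migration note: each removed DEPRECATED declaration names its replacement, and the argument lists are unchanged, so porting is a mechanical rename. A before/after sketch, assuming a valid llama_context * ctx and an active sequence id seq; the negative-p1 "to the end of the sequence" convention follows the usual llama.cpp KV range semantics:

#include "llama.h"

// Sketch of the rename-only migration implied by the removed DEPRECATED
// wrappers; every llama_kv_self_* replacement keeps the old argument list.
static void trim_and_compact(struct llama_context * ctx, llama_seq_id seq, llama_pos keep_up_to) {
    // before: llama_kv_cache_seq_rm(ctx, seq, keep_up_to, -1);
    llama_kv_self_seq_rm(ctx, seq, keep_up_to, -1); // drop cells of seq at pos >= keep_up_to

    // before: llama_kv_cache_defrag(ctx); llama_kv_cache_update(ctx);
    llama_kv_self_defrag(ctx); // schedule defragmentation
    llama_kv_self_update(ctx); // apply pending K-shifts / defrag
}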
