
Commit be5cacc

llama : only use default buffer types for the KV cache (#10358)

1 parent: 20a780c

2 files changed: 9 additions & 16 deletions


ggml/src/ggml-backend.cpp

Lines changed: 5 additions & 4 deletions
@@ -689,7 +689,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
 }
 
 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
-    ggml_backend_buffer_t buffer = tensor->buffer;
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -722,8 +722,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML
 
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
-    // TODO: use supports_op to check if the backend supports the op
-
     // assign pre-allocated nodes to their backend
     int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
@@ -742,7 +740,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
-        GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+        GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
     }
 
     // graph input
@@ -886,6 +884,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
         // do not overwrite user assignments
         if (*node_backend_id == -1) {
             *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
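
As a minimal standalone sketch (not part of the commit), the buffer-resolution rule from the first hunk can be written as a hypothetical helper: a view tensor's data lives in its source tensor's buffer, so the scheduler looks through view_src before falling back to the tensor's own buffer.

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper (effective_buffer is not a real ggml function)
// mirroring the new logic in ggml_backend_sched_backend_from_buffer:
// a view carries its data in view_src's buffer, so resolve through
// view_src first and fall back to the tensor's own buffer.
static ggml_backend_buffer_t effective_buffer(const struct ggml_tensor * t) {
    return t->view_src ? t->view_src->buffer : t->buffer;
}

Combined with the ggml_is_view_op skip in the last hunk, view nodes are no longer assigned a backend of their own in this pass; where a buffer-based lookup does happen, it now follows the viewed tensor's buffer.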

src/llama.cpp

Lines changed: 4 additions & 12 deletions
@@ -3460,21 +3460,13 @@ static bool llama_kv_cache_init(
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
-        const llama_model::buft_list_t * buft_list;
+        ggml_backend_buffer_type_t buft;
         if (offload) {
-            buft_list = model.dev_layer.at(i).buft_list;
+            auto * dev = model.dev_layer.at(i).dev;
+            buft = ggml_backend_dev_buffer_type(dev);
         } else {
-            buft_list = &model.cpu_buft_list;
+            buft = ggml_backend_cpu_buffer_type();
         }
-        ggml_backend_buffer_type_t buft = select_buft(*buft_list,
-            [&](ggml_context * ctx) {
-                ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
-                if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
-                    return k;
-                }
-                ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
-                return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
-            });
         ggml_context * ctx = ctx_for_buft(buft);
 
         if (!ctx) {
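
For reference, the new selection rule condenses into a short hypothetical helper (kv_cache_buft is not part of the commit; it assumes dev is the ggml_backend_dev_t chosen for the layer, as in model.dev_layer.at(i).dev above). Rather than probing a buffer-type list with a dummy ROPE graph via select_buft, the KV cache now always takes a default buffer type:

#include "ggml-backend.h"

// Hypothetical helper condensing the new logic in llama_kv_cache_init:
// use the layer device's default buffer type when offloading the cache,
// otherwise the CPU buffer type.
static ggml_backend_buffer_type_t kv_cache_buft(ggml_backend_dev_t dev, bool offload) {
    return offload ? ggml_backend_dev_buffer_type(dev)
                   : ggml_backend_cpu_buffer_type();
}

A call site would then read, under the same assumptions: buft = kv_cache_buft(model.dev_layer.at(i).dev, offload);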
