
Commit fc8ef54

lshzh-ww authored
metal : enable ggml-alloc (#2627)
* metal : enable ggml-alloc

  Make ggml-alloc work with concurrent dispatch.

* style fix

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent bf83bff commit fc8ef54

File tree

5 files changed: +61 −26 lines

ggml-alloc.c

Lines changed: 24 additions & 1 deletion
```diff
@@ -67,6 +67,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -229,6 +231,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }
 
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +261,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +290,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -473,7 +490,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             allocate_node(alloc, input);
         }
     }
-    for (int i = 0; i < gf->n_nodes; i++) {
+    for (int ind = 0; ind < gf->n_nodes; ind++) {
+        int i;
+        if (alloc->has_parse_seq) {
+            i = alloc->parse_seq[ind];
+        } else {
+            i = ind;
+        }
         struct ggml_tensor * node = gf->nodes[i];
 
         // allocate parents (leafs)
```
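
The new entry point compacts the caller's list, dropping `-1` entries, and the allocation loop then visits nodes in that order. A minimal usage sketch (not part of this commit; the node indices are hypothetical, and the `-1` separators follow the convention the Metal backend's concur_list appears to use to delimit concurrent batches):

```c
#include "ggml.h"
#include "ggml-alloc.h"

// hand an execution order to the allocator
static void set_example_order(struct ggml_allocr * alloc) {
    // nodes 0 and 2 are meant to run concurrently, then node 1;
    // the -1 separator is filtered out by ggml_allocr_set_parse_seq
    int order[] = { 0, 2, -1, 1 };
    ggml_allocr_set_parse_seq(alloc, order, 4);
    // the allocation loop now visits nodes in the order 0, 2, 1
}
```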

ggml-alloc.h

Lines changed: 4 additions & 0 deletions
```diff
@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
```

ggml-metal.h

Lines changed: 6 additions & 3 deletions
```diff
@@ -63,10 +63,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 
 // try to find operations that can be run concurrently in the graph
 // you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
 
-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// if the graph has been optimized for concurrent dispatch, returns the length of the concur_list
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
```
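
Taken together, the intended call sequence looks like the following sketch (the names `ctx_metal`, `alloc`, and `gf` are placeholders; this mirrors the llama.cpp change further down):

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-metal.h"

// sketch: wire the Metal concurrency list into the allocator
static void install_concur_order(struct ggml_metal_context * ctx_metal,
                                 struct ggml_allocr * alloc,
                                 struct ggml_cgraph * gf) {
    // no memory-overlap check here: the allocator will lay out
    // buffers according to this order anyway
    ggml_metal_graph_find_concurrency(ctx_metal, gf, false);

    // returns 0 if no reordering was found, otherwise the list length
    int n = ggml_metal_if_optimized(ctx_metal);
    if (n > 0) {
        ggml_allocr_set_parse_seq(alloc, ggml_metal_get_concur_list(ctx_metal), n);
    }
}
```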

ggml-metal.m

Lines changed: 8 additions & 7 deletions
```diff
@@ -236,11 +236,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
     ctx->n_cb = n_cb;
 }
 
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    if (ctx->concur_list_len) {
-        return true;
-    }
-    return false;
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+    return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+    return ctx->concur_list;
 }
 
 // finds the Metal buffer that contains the tensor data on the GPU device
@@ -383,7 +384,7 @@ void ggml_metal_get_tensor(
 
 void ggml_metal_graph_find_concurrency(
     struct ggml_metal_context * ctx,
-    struct ggml_cgraph * gf) {
+    struct ggml_cgraph * gf, bool check_mem) {
     int search_depth = gf->n_nodes; // we only find concurrency in this range to avoid wasting too much time
     int nodes_unused[GGML_MAX_CONCUR];
 
@@ -430,7 +431,7 @@ void ggml_metal_graph_find_concurrency(
                 }
             }
         }
-        if (exe_flag) {
+        if (exe_flag && check_mem) {
             // check if nodes[i]'s data will be overwritten by a node before nodes[i].
             // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
             int64_t data_start = (int64_t) gf->nodes[i]->data;
```
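
The new `check_mem` flag makes the memory-overlap check optional. A hedged reading of the two modes (`ctx_metal` and `gf` are placeholders, and the rationale for `false` is an inference from the llama.cpp change below: the reordering there happens before any real allocation, so node data pointers are not meaningful yet):

```c
// conservative standalone use: keep the overlap check
ggml_metal_graph_find_concurrency(ctx_metal, gf, /*check_mem=*/true);

// when ggml-alloc drives buffer placement, the check is skipped:
// the allocator is told the new order via ggml_allocr_set_parse_seq
// and can place buffers so that concurrent nodes do not collide
ggml_metal_graph_find_concurrency(ctx_metal, gf, /*check_mem=*/false);
```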

llama.cpp

Lines changed: 19 additions & 15 deletions
```diff
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -1846,10 +1846,6 @@ static bool llama_eval_internal(
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal) {
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor (lctx.ctx_metal, res);
@@ -3287,7 +3283,18 @@ struct llama_context * llama_new_context_with_model(
             int n_past = hparams.n_ctx - n_tokens;
             llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+#ifdef GGML_USE_METAL
+            if (params.n_gpu_layers > 0) {
+                ctx->ctx_metal = ggml_metal_init(1);
+                if (!ctx->ctx_metal) {
+                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+                    llama_free(ctx);
+                    return NULL;
+                }
+                ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
@@ -3305,6 +3312,11 @@ struct llama_context * llama_new_context_with_model(
 
             ctx->buf_alloc.resize(alloc_size);
             ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+            if (ctx->ctx_metal) {
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
         }
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3319,13 +3331,6 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init(1);
-
-        if (!ctx->ctx_metal) {
-            LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
-            llama_free(ctx);
-            return NULL;
-        }
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -3354,8 +3359,7 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
```
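
One detail worth noting: `ggml_allocr_set_parse_seq` is applied twice because `ggml_allocr_new` returns a fresh allocator whose `has_parse_seq` is false, so the order installed for the measure pass must be set again on the real allocator. A condensed sketch of the two passes (not the literal code; `concur_list` and `concur_len` stand in for the `ggml_metal_get_concur_list` and `ggml_metal_if_optimized` results):

```c
// measure pass: size the graph with the concurrency-aware order installed
ggml_allocr_set_parse_seq(ctx->alloc, concur_list, concur_len);
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

// real pass: the newly created allocator starts without a parse sequence,
// so the same order has to be set once more
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
ggml_allocr_set_parse_seq(ctx->alloc, concur_list, concur_len);
```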
