
Commit 2540886

rpc : make RPC servers come first in the device list
1 parent 8f1d81a commit 2540886

File tree

1 file changed: +32 −31 lines

src/llama.cpp

Lines changed: 32 additions & 31 deletions
@@ -3346,29 +3346,29 @@ static size_t llama_get_device_count(const llama_model & model) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;

-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
     int rpc_count = (int)model.rpc_servers.size();
-    if (gpu >= dev_count - rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+    int local_gpu = gpu - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (gpu < rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu].c_str();
         return ggml_backend_rpc_buffer_type(endpoint);
     }
 #endif
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
+    buft = ggml_backend_cuda_buffer_type(local_gpu);
 #elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
+    buft = ggml_backend_vk_buffer_type(local_gpu);
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
+    buft = ggml_backend_sycl_buffer_type(local_gpu);
 #elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
+    buft = ggml_backend_kompute_buffer_type(local_gpu);
     if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
     }
 #elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(gpu);
+    buft = ggml_backend_cann_buffer_type(local_gpu);
 #endif

     if (buft == nullptr) {
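
The hunk above inverts the device index convention: a global index `gpu` now refers to an RPC server when `gpu < rpc_count`, and to the local device `gpu - rpc_count` otherwise; previously the RPC servers occupied the tail of the list (`gpu >= dev_count - rpc_count`). A minimal sketch of the new mapping, using hypothetical names (`device_ref`, `resolve_device`) rather than anything defined in llama.cpp:

```cpp
#include <string>
#include <vector>

// Hypothetical helper, not part of llama.cpp: describes what a global device
// index resolves to under the new ordering (RPC servers first, local GPUs after).
struct device_ref {
    bool        is_rpc;    // true -> RPC server, use `endpoint`
    std::string endpoint;  // set when is_rpc == true
    int         local_gpu; // local backend index, set when is_rpc == false
};

static device_ref resolve_device(const std::vector<std::string> & rpc_servers, int device) {
    const int rpc_count = (int) rpc_servers.size();
    if (device < rpc_count) {
        return { true, rpc_servers[device], -1 };
    }
    return { false, std::string(), device - rpc_count };
}
```

With two RPC servers and two local GPUs, for example, indices 0 and 1 resolve to the endpoints while indices 2 and 3 resolve to local GPUs 0 and 1; before this change the RPC servers sat at the end of the list instead.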
@@ -3403,36 +3403,36 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }

 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
     int rpc_count = (int)model.rpc_servers.size();
-    if (device >= dev_count - rpc_count) {
+    int local_device = device - rpc_count;
+#if defined(GGML_USE_RPC)
+    if (device < rpc_count) {
         size_t total;
         size_t free;
-        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        const char * endpoint = model.rpc_servers[device].c_str();
         ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
         return free;
     }
 #endif
 #if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    ggml_backend_cuda_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_SYCL)
     size_t total;
     size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    ggml_backend_sycl_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
+    ggml_backend_vk_get_device_memory(local_device, &free, &total);
     return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
-    ggml_backend_cann_get_device_memory(device, &free, &total);
+    ggml_backend_cann_get_device_memory(local_device, &free, &total);
     return free;
 #else
     return 1;
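
`llama_get_device_memory` now follows the same convention: the first `rpc_count` indices are answered over RPC via `ggml_backend_rpc_get_device_memory`, and the rest fall through to the local backend query with `local_device = device - rpc_count`. A hedged sketch of walking the combined device list under that convention; the endpoint, device counts, and memory figures below are made up for illustration:

```cpp
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// Fabricated numbers; the real code calls ggml_backend_rpc_get_device_memory()
// for RPC devices or the local backend's memory query otherwise.
static size_t free_memory(const std::vector<std::string> & rpc_servers, int device) {
    const int rpc_count = (int) rpc_servers.size();
    if (device < rpc_count) {
        return 8ull << 30;  // pretend the server at rpc_servers[device] reports 8 GiB free
    }
    return 24ull << 30;     // pretend local GPU (device - rpc_count) reports 24 GiB free
}

int main() {
    const std::vector<std::string> rpc_servers = { "192.168.1.10:50052" }; // example endpoint
    const int n_local = 1;                                                 // assumed one local GPU
    for (int i = 0; i < (int) rpc_servers.size() + n_local; ++i) {
        std::printf("device %d: %zu bytes free\n", i, free_memory(rpc_servers, i));
    }
    return 0;
}
```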
@@ -18188,6 +18188,20 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
+#if defined(GGML_USE_RPC)
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
+
 #if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
@@ -18312,19 +18326,6 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif

-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
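
The last two hunks move the RPC backend initialization from the end of the backend setup in llama_new_context_with_model to the very start, so the order of `ctx->backends` matches the new device numbering: RPC backends first, then the local GPU backend(s), with the CPU backend still appended last. A simplified sketch of that ordering, with a hypothetical `backend_t` standing in for `ggml_backend_t` and the init calls reduced to labels:

```cpp
#include <string>
#include <vector>

// Sketch only: `backend_t` stands in for ggml_backend_t; the point is the
// ordering of the backend list after this change.
using backend_t = std::string;

static std::vector<backend_t> build_backend_list(const std::vector<std::string> & rpc_servers,
                                                 int n_local_gpus) {
    std::vector<backend_t> backends;
    for (const auto & endpoint : rpc_servers) {
        backends.push_back("rpc:" + endpoint);           // indices 0 .. rpc_count-1
    }
    for (int i = 0; i < n_local_gpus; ++i) {
        backends.push_back("gpu:" + std::to_string(i));  // rpc_count .. rpc_count+n_local_gpus-1
    }
    backends.push_back("cpu");                           // CPU backend is still added last
    return backends;
}
```

In this sketch position i in the backend list lines up with device i in the device list; the real code also handles Metal, split buffers, and the other backends shown in the diff.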
