Skip to content

Commit d44e0fb

Browse files
committed
Added more comprehensive graph node checking
1 parent 9c57861 commit d44e0fb

File tree

1 file changed

+59
-9
lines changed

1 file changed

+59
-9
lines changed

ggml-cuda.cu

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2409,22 +2409,57 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
24092409
#define USE_CUDA_GRAPH
24102410
#endif
24112411

2412+
// Snapshot of the identifying properties of one GGML graph node. One snapshot
// per node is stored between tokens so the next token's graph can be compared
// against the previous one to decide whether the captured CUDA graph must be
// updated (see ggml_graph_node_has_matching_properties).
struct ggml_graph_node_properties {
    void * node_address;               // address of the ggml_tensor for this node
    int node_op;                       // the node's ggml_op, stored as a plain int
    int64_t ne[GGML_MAX_DIMS];         // number of elements per dimension
    size_t nb[GGML_MAX_DIMS];          // stride in bytes per dimension
    void * src_address[GGML_MAX_SRC];  // addresses of the node's source tensors
};
2419+
24122420
#ifdef USE_CUDA_GRAPH
24132421
#define MAX_NODES_IN_CUDA_GRAPH 10000
24142422
struct ggml_cuda_graph {
24152423
int count = 0;
24162424
cudaGraph_t graph = nullptr;
24172425
cudaGraphExec_t instance = nullptr;
24182426
size_t num_nodes = 0;
2419-
int softmax_ne0 = 0;
24202427
cudaGraphNode_t nodes[MAX_NODES_IN_CUDA_GRAPH];
24212428
cudaKernelNodeParams params[MAX_NODES_IN_CUDA_GRAPH];
24222429
bool disable_due_to_gpu_arch = false;
2430+
bool disable_due_to_too_many_updates = false;
2431+
int number_consecutive_updates = 0;
2432+
ggml_graph_node_properties ggml_graph_properties[MAX_NODES_IN_CUDA_GRAPH];
24232433
};
24242434
#endif
24252435

24262436
// Environment-variable kill switch: evaluated once, disables CUDA graph usage
// entirely when LLAMACPP_DISABLE_CUDA_GRAPHS is set to any value.
const bool disable_cuda_graphs = (getenv("LLAMACPP_DISABLE_CUDA_GRAPHS") != nullptr);
24272437

2438+
// Record the identifying properties of `node` (tensor address, op, shape,
// strides, and source-tensor addresses) into `graph_node_properties`, so the
// node can be compared against its counterpart on the next token.
GGML_CALL static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
    graph_node_properties->node_address = node;
    graph_node_properties->node_op      = node->op;

    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
        graph_node_properties->ne[dim] = node->ne[dim];
        graph_node_properties->nb[dim] = node->nb[dim];
    }

    for (int src = 0; src < GGML_MAX_SRC; ++src) {
        graph_node_properties->src_address[src] = node->src[src];
    }
}
2449+
2450+
// Return true iff `node` still matches the snapshot in `graph_node_properties`
// (same tensor address, op, per-dimension element counts and strides, and the
// same source-tensor addresses). Any mismatch signals that the captured CUDA
// graph no longer reflects the GGML graph and must be updated.
// NOTE(review): identity is checked by raw tensor address — assumes tensor
// objects are reused across tokens; confirm with the allocator's behavior.
GGML_CALL static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
    if (graph_node_properties->node_address != node ||
        graph_node_properties->node_op      != node->op) {
        return false;
    }

    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
        if (graph_node_properties->ne[dim] != node->ne[dim] ||
            graph_node_properties->nb[dim] != node->nb[dim]) {
            return false;
        }
    }

    for (int src = 0; src < GGML_MAX_SRC; ++src) {
        if (graph_node_properties->src_address[src] != node->src[src]) {
            return false;
        }
    }

    return true;
}
2462+
24282463
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
24292464
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
24302465

@@ -2446,30 +2481,36 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
24462481
}
24472482
}
24482483

2449-
// Disable CUDA graphs in presence of env var or old GPU.
2484+
// Disable CUDA graphs in presence of env var, old GPU or use-case which is changing too rapidly.
24502485
// Also disable for multi-gpu for now. TO DO investigate
2451-
if(disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch || ggml_backend_cuda_get_device_count() > 1){
2486+
if(disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch || cuda_graph.disable_due_to_too_many_updates ||
2487+
ggml_backend_cuda_get_device_count() > 1){
24522488
use_cuda_graph = false;
24532489
}
24542490

24552491
if(use_cuda_graph) {
24562492

24572493
if(cuda_graph.instance == nullptr) cuda_graph_update_required=true;
24582494

2495+
// Loop over nodes in GGML graph to determine if CUDA graph update is required
2496+
// and store properties to allow this comparison for the next token
2497+
for (int i = 0; i < cgraph->n_nodes; i++) {
2498+
bool has_matching_properties = true;
2499+
if(!cuda_graph_update_required) {
2500+
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph.ggml_graph_properties[i]);
2501+
}
2502+
if(!has_matching_properties) cuda_graph_update_required = true;
2503+
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph.ggml_graph_properties[i]);
2504+
}
2505+
24592506
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
24602507
int k=0;
24612508
for (int i = 0; i < cgraph->n_nodes; i++) {
24622509
ggml_tensor * node = cgraph->nodes[i];
2463-
// Identify if the graph needs to be updated for this token due to the number of elements changing
2464-
// (identified by inspecting soft max op parameters)
24652510
if(node->op == GGML_OP_SOFT_MAX) {
24662511
if(node->src[1]->ne[1] > 1){
24672512
use_cuda_graph = false; // disable CUDA graphs for batch size > 1 for now. TO DO investigate
24682513
}
2469-
if(node->src[0]->ne[0] != cuda_graph.softmax_ne0) {
2470-
cuda_graph_update_required = true;
2471-
cuda_graph.softmax_ne0 = node->src[0]->ne[0];
2472-
}
24732514
}
24742515
if(node->op == GGML_OP_CPY) {
24752516
// store the copy op parameter which changes with each token.
@@ -2480,6 +2521,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
24802521
}
24812522
}
24822523
}
2524+
2525+
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
2526+
if(cuda_graph_update_required) {
2527+
cuda_graph.number_consecutive_updates++;
2528+
}
2529+
else {
2530+
cuda_graph.number_consecutive_updates = 0;
2531+
}
2532+
if (cuda_graph.number_consecutive_updates >= 4) cuda_graph.disable_due_to_too_many_updates = true;
24832533
}
24842534

24852535
if(use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture

0 commit comments

Comments
 (0)