@@ -2409,22 +2409,57 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 #define USE_CUDA_GRAPH
 #endif
 
+struct ggml_graph_node_properties {
+    void * node_address;
+    int node_op;
+    int64_t ne[GGML_MAX_DIMS];
+    size_t nb[GGML_MAX_DIMS];
+    void * src_address[GGML_MAX_SRC];
+};
+
 #ifdef USE_CUDA_GRAPH
 #define MAX_NODES_IN_CUDA_GRAPH 10000
 struct ggml_cuda_graph {
     int count = 0;
     cudaGraph_t graph = nullptr;
     cudaGraphExec_t instance = nullptr;
     size_t num_nodes = 0;
-    int softmax_ne0 = 0;
     cudaGraphNode_t nodes[MAX_NODES_IN_CUDA_GRAPH];
     cudaKernelNodeParams params[MAX_NODES_IN_CUDA_GRAPH];
     bool disable_due_to_gpu_arch = false;
+    bool disable_due_to_too_many_updates = false;
+    int number_consecutive_updates = 0;
+    ggml_graph_node_properties ggml_graph_properties[MAX_NODES_IN_CUDA_GRAPH];
 };
 #endif
 
 const bool disable_cuda_graphs = (getenv("LLAMACPP_DISABLE_CUDA_GRAPHS") != nullptr);
 
+GGML_CALL static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    graph_node_properties->node_address = node;
+    graph_node_properties->node_op = node->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        graph_node_properties->ne[i] = node->ne[i];
+        graph_node_properties->nb[i] = node->nb[i];
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        graph_node_properties->src_address[i] = node->src[i];
+    }
+}
+
+GGML_CALL static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    if (node != graph_node_properties->node_address) return false;
+    if (node->op != graph_node_properties->node_op) return false;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (node->ne[i] != graph_node_properties->ne[i]) return false;
+        if (node->nb[i] != graph_node_properties->nb[i]) return false;
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (node->src[i] != graph_node_properties->src_address[i]) return false;
+    }
+    return true;
+}
+
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -2446,30 +2481,36 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
     }
 
-    // Disable CUDA graphs in presence of env var or old GPU.
+    // Disable CUDA graphs in presence of env var, old GPU or use-case which is changing too rapidly.
     // Also disable for multi-gpu for now. TO DO investigate
-    if (disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch || ggml_backend_cuda_get_device_count() > 1) {
+    if (disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch || cuda_graph.disable_due_to_too_many_updates ||
+        ggml_backend_cuda_get_device_count() > 1) {
         use_cuda_graph = false;
     }
 
     if (use_cuda_graph) {
 
        if (cuda_graph.instance == nullptr) cuda_graph_update_required = true;
 
+        // Loop over nodes in GGML graph to determine if CUDA graph update is required
+        // and store properties to allow this comparison for the next token
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            bool has_matching_properties = true;
+            if (!cuda_graph_update_required) {
+                has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph.ggml_graph_properties[i]);
+            }
+            if (!has_matching_properties) cuda_graph_update_required = true;
+            set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph.ggml_graph_properties[i]);
+        }
+
         // Loop over nodes in GGML graph to obtain info needed for CUDA graph
         int k = 0;
         for (int i = 0; i < cgraph->n_nodes; i++) {
             ggml_tensor * node = cgraph->nodes[i];
-            // Identify if the graph needs to be updated for this token due to the number of elements changing
-            // (identified by inspecting soft max op parameters)
             if (node->op == GGML_OP_SOFT_MAX) {
                 if (node->src[1]->ne[1] > 1) {
                     use_cuda_graph = false; // disable CUDA graphs for batch size > 1 for now. TO DO investigate
                 }
-                if (node->src[0]->ne[0] != cuda_graph.softmax_ne0) {
-                    cuda_graph_update_required = true;
-                    cuda_graph.softmax_ne0 = node->src[0]->ne[0];
-                }
             }
             if (node->op == GGML_OP_CPY) {
                 // store the copy op parameter which changes with each token.
@@ -2480,6 +2521,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                }
            }
        }
+
+        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+        if (cuda_graph_update_required) {
+            cuda_graph.number_consecutive_updates++;
+        }
+        else {
+            cuda_graph.number_consecutive_updates = 0;
+        }
+        if (cuda_graph.number_consecutive_updates >= 4) cuda_graph.disable_due_to_too_many_updates = true;
     }
 
     if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
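
Taken together, the patch replaces the old softmax-specific change detection with a generic per-node property check and adds a back-off that turns CUDA graphs off once too many tokens in a row force a re-capture. The following self-contained C++ sketch illustrates that per-token decision flow; the names graph_cache and decide_graph_update are hypothetical stand-ins for illustration, not the actual ggml API, and only the control logic mirrors the patch.

// Minimal sketch of the per-token CUDA graph update heuristic introduced above.
// All names here are illustrative; only the logic mirrors the patch.
#include <cstdio>

struct graph_cache {
    bool instantiated = false;                   // true once a cudaGraphExec_t instance exists
    int  number_consecutive_updates = 0;
    bool disable_due_to_too_many_updates = false;
};

// 'properties_changed' stands in for the loop that compares each GGML node
// against the properties stored for the previous token.
static bool decide_graph_update(graph_cache & g, bool properties_changed) {
    bool update_required = !g.instantiated || properties_changed;
    if (update_required) {
        g.number_consecutive_updates++;
    } else {
        g.number_consecutive_updates = 0;
    }
    // As in the patch, the disable flag only takes effect from the NEXT token:
    // the current token still performs its update.
    if (g.number_consecutive_updates >= 4) {
        g.disable_due_to_too_many_updates = true;
    }
    return update_required;
}

int main() {
    graph_cache g;
    // Token 0: no instance exists yet, so the initial capture is always required.
    printf("token 0: update=%d\n", decide_graph_update(g, false));
    g.instantiated = true;
    // Tokens 1-4: every token changes node properties; once four updates have
    // happened in a row, the cache flags CUDA graphs for disabling on later tokens.
    for (int t = 1; t <= 4; t++) {
        printf("token %d: update=%d disabled_next=%d\n",
               t, decide_graph_update(g, true), g.disable_due_to_too_many_updates);
    }
    return 0;
}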