@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
787
787
788
788
static void ggml_vk_submit (vk_context& ctx, vk::Fence fence) {
789
789
if (ctx->seqs .empty ()) {
790
+ if (fence) {
791
+ ctx->q ->queue .submit ({}, fence);
792
+ }
790
793
return ;
791
794
}
792
795
VK_LOG_DEBUG (" ggml_vk_submit(" << ctx << " , " << fence << " )" );
@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5658
5661
}
5659
5662
}
5660
5663
5661
- static void ggml_vk_build_graph (ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
5664
+ static bool ggml_vk_compute_forward (ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
5665
+
5666
+ // Returns true if node has enqueued work into the queue, false otherwise
5667
+ // If submit is true, all operations queued so far are submitted to Vulkan to overlap cmdlist creation and GPU execution.
5668
+ static bool ggml_vk_build_graph (ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
5662
5669
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra ;
5663
5670
5664
5671
if (ggml_is_empty (node) || extra == nullptr ) {
5665
- return ;
5672
+ return false ;
5666
5673
}
5667
5674
5668
5675
VK_LOG_DEBUG (" ggml_vk_build_graph(" << node << " , " << ggml_op_name (node->op ) << " )" );
@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5679
5686
case GGML_OP_PERMUTE:
5680
5687
case GGML_OP_TRANSPOSE:
5681
5688
case GGML_OP_NONE:
5682
- return ;
5689
+ return false ;
5683
5690
case GGML_OP_UNARY:
5684
5691
switch (ggml_get_unary_op (node)) {
5685
5692
case GGML_UNARY_OP_SILU:
@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5689
5696
case GGML_UNARY_OP_TANH:
5690
5697
break ;
5691
5698
default :
5692
- return ;
5699
+ return false ;
5693
5700
}
5694
5701
break ;
5695
5702
case GGML_OP_REPEAT:
@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5726
5733
default :
5727
5734
std::cerr << " ggml_vulkan: Error: Missing op: " << ggml_op_name (node->op ) << std::endl;
5728
5735
GGML_ABORT (" fatal error" );
5729
- return ;
5736
+ return false ;
5730
5737
}
5731
5738
5732
5739
vk_context compute_ctx;
@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5826
5833
ggml_vk_unary (ctx, compute_ctx, src0, node, dryrun);
5827
5834
break ;
5828
5835
default :
5829
- return ;
5836
+ return false ;
5830
5837
}
5831
5838
break ;
5832
5839
case GGML_OP_DIAG_MASK_INF:
@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5870
5877
5871
5878
break ;
5872
5879
default :
5873
- return ;
5880
+ return false ;
5874
5881
}
5875
5882
5876
5883
if (dryrun) {
5877
- return ;
5884
+ return false ;
5878
5885
}
5879
5886
5880
5887
ctx->tensor_ctxs [node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5885
5892
last_node = true ;
5886
5893
#endif
5887
5894
5888
- if (last_node) {
5895
+ if (submit || last_node) {
5889
5896
ggml_vk_ctx_end (compute_ctx);
5890
- compute_ctx->exit_tensor_idx = node_idx;
5897
+
5898
+ // TODO: it would probably be better to pass an exit_node flag to ggml_vk_compute_forward
5899
+ if (last_node) {
5900
+ compute_ctx->exit_tensor_idx = node_idx_begin;
5901
+ }
5902
+ else {
5903
+ compute_ctx->exit_tensor_idx = -1 ;
5904
+ }
5905
+
5891
5906
ctx->compute_ctx .reset ();
5907
+
5908
+ bool ok = ggml_vk_compute_forward (ctx, node_begin, node_idx_begin, false );
5909
+ if (!ok) {
5910
+ if (node->op == GGML_OP_UNARY) {
5911
+ std::cerr << __func__ << " : error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name (static_cast <ggml_unary_op>(node->op_params [0 ])) << " )" << std::endl;
5912
+ }
5913
+ else {
5914
+ std::cerr << __func__ << " : error: op not supported " << node->name << " (" << ggml_op_name (node->op ) << " )" << std::endl;
5915
+ }
5916
+ }
5917
+
5892
5918
}
5919
+ return true ;
5893
5920
}
5894
5921
5895
- static bool ggml_vk_compute_forward (ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
5922
+ static bool ggml_vk_compute_forward (ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true ){
5896
5923
ggml_tensor_extra_gpu * extra = nullptr ;
5897
5924
5898
5925
switch (tensor->op ) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
5960
5987
5961
5988
VK_LOG_DEBUG (" ggml_vk_compute_forward(" << tensor << " , name=" << tensor->name << " , op=" << ggml_op_name (tensor->op ) << " , type=" << tensor->type << " , ne0=" << tensor->ne [0 ] << " , ne1=" << tensor->ne [1 ] << " , ne2=" << tensor->ne [2 ] << " , ne3=" << tensor->ne [3 ] << " , nb0=" << tensor->nb [0 ] << " , nb1=" << tensor->nb [1 ] << " , nb2=" << tensor->nb [2 ] << " , nb3=" << tensor->nb [3 ] << " , view_src=" << tensor->view_src << " , view_offs=" << tensor->view_offs << " )" );
5962
5989
5963
- #ifdef GGML_VULKAN_CHECK_RESULTS
5964
- ggml_vk_check_results_0 (tensor);
5965
- #endif
5966
-
5967
5990
vk_context subctx = ctx->tensor_ctxs [tensor_idx].lock ();
5968
5991
5969
- #ifdef GGML_VULKAN_PERF
5970
- std::chrono::steady_clock::time_point start;
5971
- #endif // GGML_VULKAN_PERF
5992
+ // always wait for the GPU work to be done for the last submit
5993
+ if (tensor_idx == subctx->exit_tensor_idx ) {
5994
+ use_fence = true ;
5995
+ }
5972
5996
5973
5997
// Only run if ctx hasn't been submitted yet
5974
5998
if (!subctx->seqs .empty ()) {
5999
+ #ifdef GGML_VULKAN_CHECK_RESULTS
6000
+ ggml_vk_check_results_0 (tensor);
6001
+ use_fence = true ;
6002
+ #endif
6003
+
5975
6004
// Do staging buffer copies
5976
6005
for (auto & cpy : subctx->in_memcpys ) {
5977
6006
memcpy (cpy.dst , cpy.src , cpy.n );
5978
6007
}
5979
6008
5980
- #ifdef GGML_VULKAN_PERF
5981
- start = std::chrono::steady_clock::now ();
5982
- #endif // GGML_VULKAN_PERF
6009
+ ggml_vk_submit (subctx, use_fence ? ctx->fence : vk::Fence{});
6010
+
6011
+ if (use_fence) {
6012
+ VK_CHECK (ctx->device ->device .waitForFences ({ ctx->fence }, true , UINT64_MAX), " ggml_vk_compute_forward waitForFences" );
5983
6013
5984
- ggml_vk_submit (subctx, ctx->fence );
6014
+ ctx->device ->device .resetFences ({ ctx->fence });
6015
+ }
6016
+ #ifdef GGML_VULKAN_CHECK_RESULTS
6017
+ ggml_vk_check_results_1 (tensor);
6018
+ #endif
5985
6019
}
5986
6020
5987
6021
if (tensor_idx == subctx->exit_tensor_idx ) {
5988
- VK_CHECK (ctx->device ->device .waitForFences ({ ctx->fence }, true , UINT64_MAX), " ggml_vk_compute_forward waitForFences" );
5989
-
5990
- #ifdef GGML_VULKAN_PERF
5991
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now () - start);
5992
- ctx->device ->perf_logger ->log_timing (tensor, duration.count ());
5993
- #endif // GGML_VULKAN_PERF
5994
-
5995
- ctx->device ->device .resetFences ({ ctx->fence });
5996
-
5997
6022
// Do staging buffer copies
5998
6023
for (auto & cpy : subctx->out_memcpys ) {
5999
6024
memcpy (cpy.dst , cpy.src , cpy.n );
@@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6482
6507
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context ;
6483
6508
6484
6509
for (int i = 0 ; i < cgraph->n_nodes ; i++) {
6485
- ggml_vk_build_graph (ctx, cgraph->nodes [i], i, 0 , true );
6510
+ ggml_vk_build_graph (ctx, cgraph->nodes [i], i, nullptr , 0 , true , false , false );
6486
6511
}
6487
6512
ggml_vk_preallocate_buffers (ctx);
6488
6513
ggml_pipeline_allocate_descriptor_sets (ctx->device );
@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6497
6522
// Reserve tensor context space for all nodes
6498
6523
ctx->tensor_ctxs .resize (cgraph->n_nodes );
6499
6524
6500
- for (int i = 0 ; i < cgraph->n_nodes ; i++) {
6501
- ggml_vk_build_graph (ctx, cgraph->nodes [i], i, i == last_node, false );
6502
- }
6525
+ bool first_node_in_batch = true ; // true if next node will be first node in a batch
6526
+ int submit_node_idx = 0 ; // index to first node in a batch
6503
6527
6528
+ // submit work every submit_count nodes to overlap CPU cmdbuffer generation with GPU execution
6529
+ constexpr int submit_count = 100 ;
6530
+ int submitted_nodes = 0 ;
6504
6531
for (int i = 0 ; i < cgraph->n_nodes ; i++) {
6505
- ggml_tensor * node = cgraph->nodes [i];
6506
-
6507
- if (ggml_vk_is_empty (node)) {
6508
- continue ;
6532
+ if (first_node_in_batch) {
6533
+ submit_node_idx = i;
6509
6534
}
6510
6535
6511
- bool ok = ggml_vk_compute_forward (ctx, node, i);
6512
- if (!ok) {
6513
- if (node->op == GGML_OP_UNARY) {
6514
- std::cerr << __func__ << " : error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name (static_cast <ggml_unary_op>(node->op_params [0 ])) << " )" << std::endl;
6515
- } else {
6516
- std::cerr << __func__ << " : error: op not supported " << node->name << " (" << ggml_op_name (node->op ) << " )" << std::endl;
6536
+ bool submit = (submitted_nodes >= submit_count) || (i == last_node);
6537
+
6538
+
6539
+ bool enqueued = ggml_vk_build_graph (ctx, cgraph->nodes [i], i, cgraph->nodes [submit_node_idx], submit_node_idx, false , i == last_node, submit);
6540
+
6541
+ if (enqueued) {
6542
+ ++submitted_nodes;
6543
+
6544
+ #ifndef GGML_VULKAN_CHECK_RESULTS
6545
+ if (first_node_in_batch) {
6546
+ first_node_in_batch = false ;
6517
6547
}
6548
+ #endif
6518
6549
}
6519
- #ifdef GGML_VULKAN_CHECK_RESULTS
6520
- else {
6521
- ggml_vk_check_results_1 (node);
6550
+
6551
+ if (submit) {
6552
+ first_node_in_batch = true ;
6553
+ submitted_nodes = 0 ;
6522
6554
}
6523
- #endif
6524
- GGML_ASSERT (ok);
6525
6555
}
6526
6556
6527
6557
#ifdef GGML_VULKAN_PERF
0 commit comments