From 54016c452418b02b2d880ad49047238b21706d71 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 11 Dec 2023 22:07:50 +0100 Subject: [PATCH] Work around sub_group load/store issues --- CMakeLists.txt | 2 +- dpnp/backend/CMakeLists.txt | 2 ++ dpnp/backend/extensions/lapack/CMakeLists.txt | 2 +- dpnp/backend/extensions/sycl_ext/CMakeLists.txt | 2 +- dpnp/backend/extensions/vm/CMakeLists.txt | 2 +- dpnp/backend/kernels/dpnp_krnl_bitwise.cpp | 11 ++++++++++- dpnp/backend/kernels/dpnp_krnl_elemwise.cpp | 14 ++++++++++++-- dpnp/backend/kernels/dpnp_krnl_logic.cpp | 14 ++++++++++++-- dpnp/backend/kernels/dpnp_krnl_mathematical.cpp | 11 ++++++++++- 9 files changed, 50 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 46e988bdcbe6..ccb2bef6972e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,7 +80,7 @@ set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") find_package(Cython REQUIRED) find_package(Dpctl REQUIRED) -message(STATUS "Dpctl_INCLUDE_DIRS=" ${Dpctl_INCLUDE_DIRS}) +message(STATUS "Dpctl_INCLUDE_DIR=" ${Dpctl_INCLUDE_DIR}) message(STATUS "Dpctl_TENSOR_INCLUDE_DIR=" ${Dpctl_TENSOR_INCLUDE_DIR}) if(WIN32) diff --git a/dpnp/backend/CMakeLists.txt b/dpnp/backend/CMakeLists.txt index 18f084d54478..23ddc447bd9b 100644 --- a/dpnp/backend/CMakeLists.txt +++ b/dpnp/backend/CMakeLists.txt @@ -107,6 +107,8 @@ target_compile_definitions(${_trgt} PUBLIC PSTL_USE_PARALLEL_POLICIES=0) target_compile_definitions(${_trgt} PUBLIC ONEDPL_USE_PREDEFINED_POLICIES=0) target_include_directories(${_trgt} PUBLIC ${Dpctl_INCLUDE_DIR}) +target_include_directories(${_trgt} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) + target_link_directories(${_trgt} PUBLIC "${Dpctl_INCLUDE_DIR}/..") target_link_libraries(${_trgt} PUBLIC DPCTLSyclInterface) diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 7679db38d6a7..c9c04db05813 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -47,7 +47,7 @@ set_target_properties(${python_module_name} PROPERTIES CMAKE_POSITION_INDEPENDEN target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include) target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if (WIN32) diff --git a/dpnp/backend/extensions/sycl_ext/CMakeLists.txt b/dpnp/backend/extensions/sycl_ext/CMakeLists.txt index 4aff4d2e766c..5f7c25f6c93f 100644 --- a/dpnp/backend/extensions/sycl_ext/CMakeLists.txt +++ b/dpnp/backend/extensions/sycl_ext/CMakeLists.txt @@ -45,7 +45,7 @@ set_target_properties(${python_module_name} PROPERTIES CMAKE_POSITION_INDEPENDEN target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include) target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if (WIN32) diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 3f13fb571a6b..b6f863164da2 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -45,7 +45,7 @@ set_target_properties(${python_module_name} PROPERTIES CMAKE_POSITION_INDEPENDEN target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include) target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src) -target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) if (WIN32) diff --git a/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp b/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp index 4fdd1a394ed5..521006770c5d 100644 --- a/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_bitwise.cpp @@ -32,6 +32,12 @@ #include "dpnpc_memory_adapter.hpp" #include "queue_sycl.hpp" +// dpctl tensor headers +#include "kernels/alignment.hpp" + +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + template class dpnp_invert_c_kernel; @@ -67,7 +73,10 @@ DPCTLSyclEventRef dpnp_invert_c(DPCTLSyclQueueRef q_ref, vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) + sg.get_group_id()[0] * max_sg_size); - if (start + static_cast(vec_sz) * max_sg_size < size) { + if (is_aligned(input_data) && + is_aligned(result) && + (start + static_cast(vec_sz) * max_sg_size < size)) + { auto input_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&input_data[start]); diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp index 9a85510cb015..0433ec5c18e9 100644 --- a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp @@ -34,6 +34,12 @@ #include "dpnpc_memory_adapter.hpp" #include "queue_sycl.hpp" +// dpctl tensor headers +#include "kernels/alignment.hpp" + +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + #define MACRO_1ARG_2TYPES_OP(__name__, __operation1__, __operation2__) \ template \ @@ -1198,8 +1204,12 @@ static void func_map_init_elemwise_1arg_1type(func_map_t &fmap) (nd_it.get_group(0) * nd_it.get_local_range(0) + \ sg.get_group_id()[0] * max_sg_size); \ \ - if (start + static_cast(vec_sz) * max_sg_size < \ - result_size) { \ + if (is_aligned(input1_data) && \ + is_aligned(input2_data) && \ + is_aligned(result) && \ + (start + static_cast(vec_sz) * max_sg_size < \ + result_size)) \ + { \ auto input1_multi_ptr = sycl::address_space_cast< \ sycl::access::address_space::global_space, \ sycl::access::decorated::yes>( \ diff --git a/dpnp/backend/kernels/dpnp_krnl_logic.cpp b/dpnp/backend/kernels/dpnp_krnl_logic.cpp index 0674f1387b79..359830242e38 100644 --- a/dpnp/backend/kernels/dpnp_krnl_logic.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_logic.cpp @@ -31,6 +31,12 @@ #include "dpnpc_memory_adapter.hpp" #include "queue_sycl.hpp" +// dpctl tensor headers +#include "kernels/alignment.hpp" + +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + template class dpnp_all_c_kernel; @@ -610,8 +616,12 @@ DPCTLSyclEventRef (*dpnp_any_ext_c)(DPCTLSyclQueueRef, vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) + \ sg.get_group_id()[0] * max_sg_size); \ \ - if (start + static_cast(vec_sz) * max_sg_size < \ - result_size) { \ + if (is_aligned(input1_data) && \ + is_aligned(input2_data) && \ + is_aligned(result) && \ + (start + static_cast(vec_sz) * max_sg_size < \ + result_size)) \ + { \ auto input1_multi_ptr = sycl::address_space_cast< \ sycl::access::address_space::global_space, \ sycl::access::decorated::yes>(&input1_data[start]); \ diff --git a/dpnp/backend/kernels/dpnp_krnl_mathematical.cpp b/dpnp/backend/kernels/dpnp_krnl_mathematical.cpp index 975f4b67ca8d..25c5b1c8d7dc 100644 --- a/dpnp/backend/kernels/dpnp_krnl_mathematical.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_mathematical.cpp @@ -35,6 +35,12 @@ #include "dpnpc_memory_adapter.hpp" #include "queue_sycl.hpp" +// dpctl tensor headers +#include "kernels/alignment.hpp" + +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + static_assert(__SYCL_COMPILER_VERSION >= __SYCL_COMPILER_VECTOR_ABS_CHANGED, "SYCL DPC++ compiler does not meet minimum version requirement"); @@ -163,7 +169,10 @@ DPCTLSyclEventRef vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) + sg.get_group_id()[0] * max_sg_size); - if (start + static_cast(vec_sz) * max_sg_size < size) { + if (is_aligned(array1) && + is_aligned(result) && + (start + static_cast(vec_sz) * max_sg_size < size)) + { auto array_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, sycl::access::decorated::yes>(&array1[start]);