From c26934dd24456f5a09f9ef1d45faf294afe82009 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Sat, 24 May 2025 11:52:11 +0530
Subject: [PATCH 1/6] SYCL: Implement a few same-quantized-type copy kernels

---
 ggml/src/ggml-sycl/cpy.cpp       | 137 +++++++++++++++++++++++++++++++
 ggml/src/ggml-sycl/ggml-sycl.cpp |  15 ++++
 2 files changed, 152 insertions(+)

diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp
index 44487c25646d6..967da2fb1d8f1 100644
--- a/ggml/src/ggml-sycl/cpy.cpp
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -3,6 +3,8 @@
 #include
 
 #include "dequantize.hpp"
+#include "ggml-sycl/common.hpp"
+#include "ggml.h"
 
 static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
     if (x <= val[0]) {
@@ -116,6 +118,38 @@ static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     }
 }
 
+/* quantized type same copy */
+static void cpy_block_q8_0_q8_0(const char * cxi, char * cdsti) {
+    const block_q8_0 * xi = (const block_q8_0 *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+    *dsti = *xi;
+}
+
+static void cpy_block_q5_0_q5_0(const char * cxi, char * cdsti) {
+    const block_q5_0 * xi = (const block_q5_0 *) cxi;
+    block_q5_0 * dsti = (block_q5_0 *) cdsti;
+    *dsti = *xi;
+}
+
+
+static void cpy_block_q5_1_q5_1(const char * cxi, char * cdsti) {
+    const block_q5_1 * xi = (const block_q5_1 *) cxi;
+    block_q5_1 * dsti = (block_q5_1 *) cdsti;
+    *dsti = *xi;
+}
+
+static void cpy_block_q4_0_q4_0(const char * cxi, char * cdsti) {
+    const block_q4_0 * xi = (const block_q4_0 *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+    *dsti = *xi;
+}
+
+static void cpy_block_q4_1_q4_1(const char * cxi, char * cdsti) {
+    const block_q4_1 * xi = (const block_q4_1 *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+    *dsti = *xi;
+}
+
 static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
     float * cdstf = (float *) (cdsti);
@@ -311,6 +345,34 @@ template static void cpy_blck_q_f32(const
     }
 }
 
+
+template <cpy_kernel_t cpy_blck, int qk>
+static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
+                    const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
+                    const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
+                    const sycl::nd_item<3> & item_ct1) {
+    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03 = i / (ne00 * ne01 * ne02);
+    const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+    const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
+    const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
+    const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+
+    const int i13 = i / (ne10 * ne11 * ne12);
+    const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
+    const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
+    const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
+    const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 template <cpy_kernel_t cpy_blck, int qk>
 static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
                       const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
@@ -322,6 +384,7 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00
         return;
     }
 
+
     const int i03 = i / (ne00 * ne01 * ne02);
     const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
     const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
@@ -615,6 +678,70 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
     }
 }
 
+static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+
+    const int num_blocks = ne;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<cpy_block_q8_0_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
+
+static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+
+    const int num_blocks = ne;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<cpy_block_q5_0_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
+
+static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+
+    const int num_blocks = ne;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<cpy_block_q5_1_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
+
+static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+
+    const int num_blocks = ne;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<cpy_block_q4_0_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
+
+static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+
+    const int num_blocks = ne;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<cpy_block_q4_1_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
 void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
     // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a
     // dst ggml_tensor and rely on its src field
     scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
@@ -684,6 +811,16 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
         ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
                                  nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_q8_0_q8_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_Q5_0) {
+        ggml_cpy_q5_0_q5_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_Q5_1) {
+        ggml_cpy_q5_1_q5_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_q4_0_q4_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_q4_1_q4_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else {
         GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type),
                        ggml_type_name(src1->type));
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 78513114c55f3..d2cedaf2f5564 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4271,6 +4271,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
                 return true;
             }
+            if(src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) {
+                return true;
+            }
+            if(src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) {
+                return true;
+            }
+            if(src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) {
+                return true;
+            }
+            if(src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) {
+                return true;
+            }
+            if(src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) {
+                return true;
+            }
             return false;
         }
         case GGML_OP_CONCAT:

From 608e88115244f1aa3c587336b3134b4f8c070007 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Wed, 28 May 2025 11:28:28 +0530
Subject: [PATCH 2/6] Use memcpy for copying contiguous tensors

ggml-ci
---
 ggml/src/ggml-sycl/cpy.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp
index 967da2fb1d8f1..bc8d566eba738 100644
--- a/ggml/src/ggml-sycl/cpy.cpp
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -759,8 +759,11 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;
-
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+    GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
+                    ggml_type_name(src1->type));
+    if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) {
+        main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0));
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
                               nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {

From faeb7f3493923b48d4be68d5aa2cca363f867165 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Sat, 31 May 2025 12:09:58 +0530
Subject: [PATCH 3/6] feat(sycl): add contiguous tensor copy support and device checks

Adds a memcpy path for contiguous tensors of the same type to optimize
data transfer. Updates device support checks to recognize contiguous
tensor operations, improving compatibility and performance.
---
 ggml/src/ggml-sycl/cpy.cpp       | 4 ++--
 ggml/src/ggml-sycl/ggml-sycl.cpp | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp
index bc8d566eba738..073731c6c7837 100644
--- a/ggml/src/ggml-sycl/cpy.cpp
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -1,6 +1,7 @@
 #include "cpy.hpp"
 
 #include
+#include
 
 #include "dequantize.hpp"
@@ -759,9 +760,8 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;
-    GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
-                    ggml_type_name(src1->type));
     if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) {
+        GGML_SYCL_DEBUG("%s: memcpy path\n", __func__);
         main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0));
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index d2cedaf2f5564..8ccc6f1c8af94 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4226,6 +4226,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         {
             ggml_type src0_type = op->src[0]->type;
             ggml_type src1_type = op->src[1]->type;
+            if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]))) {
+                return true;
+            }
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                 return true;
             }

From b36c550d34de496f0c14a3eac6b96c3bee4e3422 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Sun, 1 Jun 2025 18:37:02 +0530
Subject: [PATCH 4/6] refactor: replace specific block copy functions with template

The changes replace multiple redundant block copy functions (e.g.,
cpy_block_q8_0_q8_0, cpy_block_q5_0_q5_0) with a single templated
function cpy_blck_q_q. This reduces code duplication by using a generic
template that works for any block type, improving maintainability while
preserving the same functionality. The template is instantiated with
specific block types (e.g., block_q8_0) where needed.
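
Note (illustrative sketch, not part of the diff below): a same-type copy of
one quantized block is a bit-identical struct assignment, which is why a
single template over the block type suffices. The simplified block layout
here is a hypothetical stand-in; the real definitions live in ggml-common.h:

    #include <cstdint>

    // Hypothetical stand-in for a ggml quantized block such as block_q8_0
    // (one fp16 scale plus 32 int8 weights); not the real ggml layout.
    struct block_q8_0_like {
        uint16_t d;        // fp16 scale bits
        int8_t   qs[32];   // quantized weights
    };

    // One template replaces the five per-type helpers: for a same-type
    // copy, plain assignment copies the whole block.
    template <typename T>
    static void cpy_blck_q_q(const char * cxi, char * cdsti) {
        const T * xi   = (const T *) cxi;
        T *       dsti = (T *) cdsti;
        *dsti = *xi;
    }

    // Usage: cpy_blck_q_q<block_q8_0_like>(src_ptr, dst_ptr);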
---
 ggml/src/ggml-sycl/cpy.cpp | 45 ++++++++++----------------------------
 1 file changed, 11 insertions(+), 34 deletions(-)

diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp
index 073731c6c7837..77c00846cfc0b 100644
--- a/ggml/src/ggml-sycl/cpy.cpp
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -120,36 +120,13 @@ static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     }
 }
 
 /* quantized type same copy */
-static void cpy_block_q8_0_q8_0(const char * cxi, char * cdsti) {
-    const block_q8_0 * xi = (const block_q8_0 *) cxi;
-    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+template <typename T>
+static void cpy_blck_q_q(const char * cxi, char * cdsti) {
+    const T * xi = (const T *) cxi;
+    T * dsti = (T *) cdsti;
     *dsti = *xi;
 }
 
-static void cpy_block_q5_0_q5_0(const char * cxi, char * cdsti) {
-    const block_q5_0 * xi = (const block_q5_0 *) cxi;
-    block_q5_0 * dsti = (block_q5_0 *) cdsti;
-    *dsti = *xi;
-}
-
-
-static void cpy_block_q5_1_q5_1(const char * cxi, char * cdsti) {
-    const block_q5_1 * xi = (const block_q5_1 *) cxi;
-    block_q5_1 * dsti = (block_q5_1 *) cdsti;
-    *dsti = *xi;
-}
-
-static void cpy_block_q4_0_q4_0(const char * cxi, char * cdsti) {
-    const block_q4_0 * xi = (const block_q4_0 *) cxi;
-    block_q4_0 * dsti = (block_q4_0 *) cdsti;
-    *dsti = *xi;
-}
-
-static void cpy_block_q4_1_q4_1(const char * cxi, char * cdsti) {
-    const block_q4_1 * xi = (const block_q4_1 *) cxi;
-    block_q4_1 * dsti = (block_q4_1 *) cdsti;
-    *dsti = *xi;
-}
 
 static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
     float * cdstf = (float *) (cdsti);
@@ -347,7 +324,7 @@ template static void cpy_blck_q_f32(const
 }
 
-template <cpy_kernel_t cpy_blck, int qk>
+template <typename T, int qk>
 static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
                     const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
                     const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
@@ -371,7 +348,7 @@ static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00,
     const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
     const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
 
-    cpy_blck(cx + x_offset, cdst + dst_offset);
+    cpy_blck_q_q<T>(cx + x_offset, cdst + dst_offset);
 }
 
 template <cpy_kernel_t cpy_blck, int qk>
@@ -687,7 +664,7 @@ static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const
     const int num_blocks = ne;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
-            cpy_q_q<cpy_block_q8_0_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+            cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
@@ -700,7 +677,7 @@ static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const
     const int num_blocks = ne;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
-            cpy_q_q<cpy_block_q5_0_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+            cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
@@ -713,7 +690,7 @@ static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const
     const int num_blocks = ne;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
-            cpy_q_q<cpy_block_q5_1_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+            cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
@@ -726,7 +703,7 @@ static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const
     const int num_blocks = ne;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
-            cpy_q_q<cpy_block_q4_0_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+            cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
@@ -739,7 +716,7 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
     const int num_blocks = ne;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
-            cpy_q_q<cpy_block_q4_1_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+            cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }

From b6db00566db36ae72adc6e680b3a6561a54e6851 Mon Sep 17 00:00:00 2001
From: Akarshan
Date: Tue, 3 Jun 2025 19:01:51 +0530
Subject: [PATCH 5/6] Exclude BF16 support for COPY tensors for now

ggml-ci
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 8ccc6f1c8af94..3936f1eaf5ef6 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4226,7 +4226,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         {
             ggml_type src0_type = op->src[0]->type;
             ggml_type src1_type = op->src[1]->type;
-            if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]))) {
+            if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) && src0_type != GGML_TYPE_BF16) {
                 return true;
             }
             if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {

From 4aa261af8983276fd6771e19eef9498cf469e825 Mon Sep 17 00:00:00 2001
From: Akarshan
Date: Wed, 4 Jun 2025 08:23:54 +0530
Subject: [PATCH 6/6] perf: adjust SYCL copy kernel block sizes for efficiency

Use ceil_div to ensure full element coverage and update nd_range
parameters to better align with SYCL block sizes, improving parallelism
and device utilization in copy operations.
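
Note (illustrative arithmetic, not part of the diff below): ceil_div rounds
the element count up to a whole number of work-groups, so the global range
always covers every element; surplus work-items simply fail the kernel's
bounds check and return. A standalone sketch, with 128 as a stand-in value
for SYCL_CPY_BLOCK_SIZE (the real constant comes from presets.hpp):

    #include <cstdio>

    // Smallest k such that k * b >= a, as used for grid sizing.
    static int ceil_div(int a, int b) { return (a + b - 1) / b; }

    int main() {
        const int ne = 1000;                     // total elements to copy
        const int wg = 128;                      // work-group size (stand-in)
        const int num_blocks = ceil_div(ne, wg); // -> 8 work-groups
        const int global_sz  = num_blocks * wg;  // -> 1024 work-items
        // The surplus work-items fail the kernel's `i >= ne` check and return.
        printf("groups=%d global=%d surplus=%d\n", num_blocks, global_sz, global_sz - ne);
        return 0;
    }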
---
 ggml/src/ggml-sycl/cpy.cpp | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp
index 77c00846cfc0b..56373b4d085d5 100644
--- a/ggml/src/ggml-sycl/cpy.cpp
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -5,6 +5,7 @@
 
 #include "dequantize.hpp"
 #include "ggml-sycl/common.hpp"
+#include "ggml-sycl/presets.hpp"
 #include "ggml.h"
 
 static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
@@ -660,10 +661,10 @@ static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const
                                const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
                                const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                const int nb12, const int nb13, queue_ptr stream) {
-
-    const int num_blocks = ne;
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
     stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
             cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
@@ -673,10 +674,10 @@ static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const
                                const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
                                const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                const int nb12, const int nb13, queue_ptr stream) {
-
-    const int num_blocks = ne;
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
     stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
             cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
@@ -686,10 +687,11 @@ static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const
                                const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
                                const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
 
-    const int num_blocks = ne;
     stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
             cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
@@ -699,10 +701,9 @@ static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const
                                const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
                                const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                const int nb12, const int nb13, queue_ptr stream) {
-
-    const int num_blocks = ne;
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
     stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
             cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
@@ -713,9 +714,9 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
                                const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                const int nb12, const int nb13, queue_ptr stream) {
 
-    const int num_blocks = ne;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
             cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
         });
 }
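
Note (not part of the series): assuming a SYCL-enabled build of llama.cpp,
these CPY paths can be exercised against the CPU reference with the
backend-op test binary, e.g. `./build/bin/test-backend-ops -o CPY`; the
`-o` op-filter spelling is an assumption to verify locally.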