From 187451b67172dfaffe71b5a0d8cf90a9874911fa Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Wed, 9 Apr 2025 22:40:00 +0100 Subject: [PATCH 01/14] sycl : Implemented reorder Q4_0 mmvq Signed-off-by: Alberto Cabrera --- ggml/src/ggml-sycl/backend.hpp | 17 ++- ggml/src/ggml-sycl/common.hpp | 5 + ggml/src/ggml-sycl/ggml-sycl.cpp | 42 ++++-- ggml/src/ggml-sycl/mmvq.cpp | 244 ++++++++++++++++++++----------- ggml/src/ggml-sycl/quants.hpp | 48 ++++++ ggml/src/ggml-sycl/vecdotq.hpp | 71 +++++++-- 6 files changed, 307 insertions(+), 120 deletions(-) create mode 100644 ggml/src/ggml-sycl/quants.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 73d807cab0be9..e7c00b596ee5f 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -13,23 +13,24 @@ #ifndef GGML_SYCL_BACKEND_HPP #define GGML_SYCL_BACKEND_HPP -#include "concat.hpp" #include "common.hpp" +#include "concat.hpp" #include "conv.hpp" #include "convert.hpp" +#include "cpy.hpp" #include "dequantize.hpp" #include "dmmv.hpp" +#include "element_wise.hpp" +#include "gla.hpp" +#include "im2col.hpp" #include "mmq.hpp" #include "mmvq.hpp" -#include "rope.hpp" #include "norm.hpp" +#include "outprod.hpp" +#include "quants.hpp" +#include "rope.hpp" #include "softmax.hpp" #include "tsembd.hpp" -#include "im2col.hpp" #include "wkv.hpp" -#include "outprod.hpp" -#include "element_wise.hpp" -#include "cpy.hpp" -#include "gla.hpp" -#endif // GGML_SYCL_BACKEND_HPP +#endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 3e1ceeaa49486..64ca8e84ddef7 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -776,4 +776,9 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t } bool gpu_has_xmx(sycl::device &dev); + +constexpr size_t safe_div(const size_t m, const size_t n) { + return (m + n - 1) / n; +} + #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 89715eaea0753..04ab68edda326 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2887,6 +2887,15 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) { return false; } +inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return true; + default: + return false; + } +} + static bool ggml_sycl_supports_dmmv(enum ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: @@ -2906,13 +2915,14 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) { } } -static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - - const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer); - int64_t min_compute_capability = INT_MAX; +static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer); + int64_t min_compute_capability = INT_MAX; if (split) { - ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context; + ggml_backend_sycl_split_buffer_type_context * buft_ctx = + (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context; auto & tensor_split = buft_ctx->tensor_split; for (int id = 0; id < ggml_sycl_info().device_count; ++id) { // skip devices that are not going to do 
any work: @@ -2925,7 +2935,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor } } } else { - min_compute_capability = ggml_sycl_info().devices[ctx.device].cc; + min_compute_capability = ggml_sycl_info().devices[ctx.device].cc; } // check data types and tensor shapes for custom matrix multiplication kernels: @@ -2948,8 +2958,13 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor #endif // SYCL_USE_XMX // mmvq path is faster in the CUDA backend. - if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda) + if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda + // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization + // is enabled takes precedence over DMMV, the current if-else implementation + // requires disabling DMMV if both conditions are met + || (ctx.opt_feature.reorder && ggml_sycl_supports_reorder_mmvq(src0->type))) { use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; + } if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // TODO: Refactor and cleanup of mul mat dispatching. @@ -2968,14 +2983,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor // KQ + KQV multi-batch ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { - ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); - // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream()); + constexpr bool convert_src1_to_q8_1 = false; + ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1); } else if (use_mul_mat_vec_q) { - ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); + constexpr bool convert_src1_to_q8_1 = true; + ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1); } else if (use_mul_mat_q) { - ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true); + constexpr bool convert_src1_to_q8_1 = true; + ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1); } else { - ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); + constexpr bool convert_src1_to_q8_1 = false; + ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1); } } diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 1b92ba2d6047e..de8f8102c3ce5 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1,6 +1,59 @@ #include "mmvq.hpp" + +#include "ggml.h" +#include "common.hpp" +#include "quants.hpp" #include "vecdotq.hpp" -#include + +template +static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols, const int nrows, const sycl::nd_item<3> & nd_item) { + using block_type = ggml_sycl_reordered::block_q_t; + using block_traits = typename block_type::traits; + + const auto sg = nd_item.get_sub_group(); + const int sg_range = sg.get_group_linear_range(); + const int workgroup_id = nd_item.get_group_linear_id(); + const int sg_id = sg.get_group_linear_id(); + const int row = workgroup_id * sg_range + sg_id; + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / block_traits::qk; + constexpr int blocks_per_subgroup = safe_div(block_traits::vdr_mmvq * 
WARP_SIZE, block_traits::qi); + constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq; + + assert(blocks_per_subgroup > 0); + assert(block_elements_per_subgroup > 0); + + const block_q8_1 * y = (const block_q8_1 *) vy; + + float partial_sum = 0.0f; + for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) { + const int ibx = row * blocks_per_row + i; // x block index + // TODO: Generalize offsets, right now only works for quantizations that don't split high and low bits + const int bx_offset = block_type::get_block_offset(ibx); + const int d_offset = block_type::get_d_offset(nrows, ncols, ibx); + + // Y block index that aligns with ibx + const int iby = i * block_type::block_to_q8_1_ratio(); + + for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) { + // x block quant index when casting the quants to int + const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup); + + partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs); + } + } + + auto sum = sycl::reduce_over_group(nd_item.get_sub_group(), partial_sum, std::plus<>()); + + if (sg.leader()) { + dst[row] = sum; + } +} template static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, @@ -480,26 +533,39 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, } } -static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, +static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, + const int nrows, dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = safe_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 0); + + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, + nd_item); + }); + }); +} + +static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK4_0 == 0); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + { + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -916,93 +982,93 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, } } -void ggml_sycl_op_mul_mat_vec_q( - ggml_backend_sycl_context & ctx, - const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const char *src0_dd_i, 
const float *src1_ddf_i, const char *src1_ddq_i, - float *dst_dd_i, const int64_t row_low, const int64_t row_high, - const int64_t src1_ncols, const int64_t src1_padded_col_size, - const dpct::queue_ptr &stream) { - +void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, + const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_col_size, + const dpct::queue_ptr & stream) { const int64_t ne10 = src1->ne[0]; GGML_ASSERT(ne10 % QK8_1 == 0); - const int64_t ne00 = src0->ne[0]; + const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); + SYCL_CHECK(CHECK_TRY_ERROR(id = get_current_device_id())); const size_t q8_1_ts = sizeof(block_q8_1); const size_t q8_1_bs = QK8_1; // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - for (int i = 0; i < src1_ncols; i++) - { + for (int i = 0; i < src1_ncols; i++) { const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs; - const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset; - float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0]; + const char * src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset; + float * dst_dd_i_bs = dst_dd_i + i * dst->ne[0]; switch (src0->type) { - case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ1_S: - mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ1_M: - mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ2_XXS: - mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ2_XS: - mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ2_S: - mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ3_XXS: - mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); 
- break; - case GGML_TYPE_IQ3_S: - mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ4_NL: - mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - case GGML_TYPE_IQ4_XS: - mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); - break; - default: - GGML_ABORT("fatal error"); + case GGML_TYPE_Q4_0: + if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && + ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } else { + mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_XXS: + mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + break; + default: + GGML_ABORT("fatal error"); } } GGML_UNUSED(src1); diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp new file mode 100644 index 0000000000000..f3dea35b15d94 --- /dev/null +++ b/ggml/src/ggml-sycl/quants.hpp @@ -0,0 +1,48 @@ +// +// MIT license +// Copyright (C) 2025 Codeplay Software Ltd. 
+// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_QUANTS_HPP +#define GGML_SYCL_QUANTS_HPP + +#include "ggml-common.h" +#include "ggml.h" + +namespace ggml_sycl_reordered { + +template struct block_q_t; + +template <> struct block_q_t { + struct traits { + static constexpr uint32_t qk = QK4_0; + static constexpr uint32_t qi = QI4_0; + static constexpr uint32_t qr = QR4_0; + static constexpr uint32_t vdr_mmvq = 2; + }; + + // qs and d are expected to be contiguous in memory + // out-of-bounds qs will access d values + // Aligment relies on the allocated size of qs, so other block types + // may require padding + + static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); } + + static constexpr int get_d_offset(int nrows, int ncols, const int block_index) { + return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half); + } + + static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } +}; + +} // namespace ggml_sycl_reordered + +#endif // GGML_SYCL_QUANTS_HPP diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp index c5942008adfbd..29e80bbc28a95 100644 --- a/ggml/src/ggml-sycl/vecdotq.hpp +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -1,6 +1,6 @@ // // MIT license -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2025 Intel Corporation // SPDX-License-Identifier: MIT // @@ -14,8 +14,11 @@ #define GGML_SYCL_VECDOTQ_HPP #include "dpct/helper.hpp" +#include "ggml.h" +#include "quants.hpp" -typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); +typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, + const int & iqs); static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) { const uint16_t* x16 = @@ -252,13 +255,60 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q +template struct reorder_vec_dot_q_sycl { + static_assert(T != T, "ggml_type for reorder vecdot not implemented"); +}; + +template <> struct reorder_vec_dot_q_sycl { + static constexpr ggml_type gtype = GGML_TYPE_Q4_0; + + using q4_0_block = ggml_sycl_reordered::block_q_t; + using q4_0_traits = typename q4_0_block::traits; + + float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, const sycl::half2 & ds8) { + int sumi = 0; + +#pragma unroll + for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); + } + + const sycl::float2 ds8f = ds8.convert(); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y()); + } + + float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset, + const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const uint8_t * bq4_0 = static_cast(vbq) + ibx_offset; + const ggml_half d = 
*(reinterpret_cast(static_cast(vbq) + d_offset)); + int v[q4_0_traits::vdr_mmvq]; + int u[2 * q4_0_traits::vdr_mmvq]; + +#pragma unroll + + for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) { + v[i] = get_int_from_uint8(bq4_0, iqs + i); + u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi); + } + + return vec_dot_q4_0_q8_1_impl(v, u, d, bq8_1->ds); + }; +}; + #define VDR_Q4_0_Q8_1_MMVQ 2 #define VDR_Q4_0_Q8_1_MMQ 4 template -static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, - const float &d4, - const sycl::half2 &ds8) { +static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, + const sycl::half2 & ds8) { int sumi = 0; #pragma unroll for (int i = 0; i < vdr; ++i) { @@ -270,8 +320,7 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); } - const sycl::float2 ds8f = - ds8.convert(); + const sycl::float2 ds8f = ds8.convert(); // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y()); @@ -456,13 +505,13 @@ vec_dot_q4_0_q8_1(const void *__restrict__ vbq, const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; int v[VDR_Q4_0_Q8_1_MMVQ]; - int u[2*VDR_Q4_0_Q8_1_MMVQ]; + int u[2 * VDR_Q4_0_Q8_1_MMVQ]; #pragma unroll for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); } return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); From 9c8d809fb36de447ea2ec85332421a7d5bdd1615 Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Wed, 9 Apr 2025 22:55:41 +0100 Subject: [PATCH 02/14] sycl : Fixed mmvq being called when reorder is disabled --- ggml/src/ggml-sycl/ggml-sycl.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 04ab68edda326..41ec0d1f09d9d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2957,12 +2957,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE); #endif // SYCL_USE_XMX + const bool reorder = static_cast(dst->src[0]->extra) && + static_cast(dst->src[0]->extra)->optimized_feature.reorder; + // mmvq path is faster in the CUDA backend. 
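
The factored form used by the reorder_vec_dot_q_sycl functor above is worth spelling out. A Q4_0 weight is a 4-bit quant q in [0, 15] that decodes to d4 * (q - 8), and a Q8_1 block stores ds = {d8, d8 * sum(u)}. The following host-side sketch (illustrative only, not part of the patch; plain scalar loops stand in for dpct::dp4a) shows why a single per-block correction term can replace the per-element "- 8":

    #include <cstdint>

    // Naive reference: dequantize each weight/activation pair, then multiply.
    static float dot_naive(const uint8_t * q, const int8_t * u, int n, float d4, float d8) {
        float acc = 0.0f;
        for (int i = 0; i < n; ++i) {
            acc += (d4 * (int(q[i]) - 8)) * (d8 * int(u[i]));
        }
        return acc;
    }

    // Factored form: sum(q*u) is what the dp4a loop accumulates into sumi, and
    // s8 == d8 * sum(u) is the precomputed ds8.y() of the Q8_1 block, so the
    // "- 8" offset collapses into one term applied after the loop.
    static float dot_factored(const uint8_t * q, const int8_t * u, int n, float d4, float d8, float s8) {
        int sumi = 0;
        for (int i = 0; i < n; ++i) {
            sumi += int(q[i]) * int(u[i]);
        }
        return d4 * (sumi * d8 - 8.0f * s8);  // == dot_naive when s8 == d8 * sum(u)
    }

In the kernel each work-item only covers vdr of the block's QI4_0 integers, which is why the correction is scaled by 8 * vdr / QI4_0 rather than a full 8.
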
    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
         // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
         // is enabled takes precedence over DMMV, the current if-else implementation
         // requires disabling DMMV if both conditions are met
-        || (ctx.opt_feature.reorder && ggml_sycl_supports_reorder_mmvq(src0->type))) {
+        || (reorder && ggml_sycl_supports_reorder_mmvq(src0->type))) {
         use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
     }

From 52b1622781f1a0e1116ad4668bb10adbc35cfa16 Mon Sep 17 00:00:00 2001
From: Alberto Cabrera
Date: Thu, 10 Apr 2025 16:10:43 +0100
Subject: [PATCH 03/14] sycl : Improved comments in the quants header

Signed-off-by: Alberto Cabrera
---
 ggml/src/ggml-sycl/quants.hpp | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp
index f3dea35b15d94..a9c3ddf7aea85 100644
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -19,8 +19,26 @@

 namespace ggml_sycl_reordered {

+
+// The reordered block moves quants (qs) and scales (d) to two
+// uniform regions of memory that are contiguous in the same tensor.
+// What this means is that instead of having:
+// [d0, qs0] [d1, qs1] [d2, qs2] ... [dN, qsN]
+// We have:
+// [qs0, qs1, qs2, ..., qsN] [d0, d1, d2, ..., dN]
+//
+// Notes: out-of-bounds qs will run into d values
+// Alignment relies on the allocated size of qs
+
 template <ggml_type type> struct block_q_t;
+
+// qk number of weights / quants in a block
+// qr number of weights in a byte (described as 'before dequantization')
+// for quantization types that have low and high bits split, qr is calculated
+// using the lower bits, e.g. for Q6 quants QR6 is 2
+// qi size of a block in 32 bit integers
+// See ggml-common.h to see how these are calculated
 template <> struct block_q_t<GGML_TYPE_Q4_0> {
     struct traits {
         static constexpr uint32_t qk = QK4_0;
@@ -29,11 +47,6 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
         static constexpr uint32_t vdr_mmvq = 2;
     };

-    // qs and d are expected to be contiguous in memory
-    // out-of-bounds qs will access d values
-    // Aligment relies on the allocated size of qs, so other block types
-    // may require padding
-
     static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }

     static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {

From e8555ab4bd92324e16ae11c58d2638a303469a72 Mon Sep 17 00:00:00 2001
From: "romain.biessy"
Date: Wed, 16 Apr 2025 18:31:52 +0200
Subject: [PATCH 04/14] Use static_assert

---
 ggml/src/ggml-sycl/mmvq.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index de8f8102c3ce5..f2d4af43db95c 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -25,8 +25,8 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
     constexpr int blocks_per_subgroup = safe_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
     constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;

-    assert(blocks_per_subgroup > 0);
-    assert(block_elements_per_subgroup > 0);
+    static_assert(blocks_per_subgroup > 0);
+    static_assert(block_elements_per_subgroup > 0);

     const block_q8_1 * y = (const block_q8_1 *) vy;

From b60d637665ab5804c10e8bc085ebf07e1dd2baf2 Mon Sep 17 00:00:00 2001
From: "romain.biessy"
Date: Wed, 16 Apr 2025 18:32:54 +0200
Subject: [PATCH 05/14] safe_div -> ceil_div
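
The new name says what the helper computes: integer division of m by n rounded up, so callers sizing work-groups get enough groups to cover a partial block. A compile-time sanity sketch (illustrative only, not part of the patch):

    #include <cstddef>

    constexpr size_t ceil_div(const size_t m, const size_t n) {
        return (m + n - 1) / n;
    }

    // splitting 10 rows into chunks of 4 needs 3 chunks, not 10/4 == 2
    static_assert(ceil_div(10, 4) == 3, "rounds up on a remainder");
    static_assert(ceil_div(8, 4) == 2, "exact division is unchanged");
    static_assert(ceil_div(1, 4) == 1, "never 0 for m > 0");
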
---
 ggml/src/ggml-sycl/common.hpp | 2 +-
 ggml/src/ggml-sycl/mmvq.cpp   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 64ca8e84ddef7..89474115978ed 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -777,7 +777,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t

 bool gpu_has_xmx(sycl::device &dev);

-constexpr size_t safe_div(const size_t m, const size_t n) {
+constexpr size_t ceil_div(const size_t m, const size_t n) {
     return (m + n - 1) / n;
 }

diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index f2d4af43db95c..96a0ec0c091ae 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -22,7 +22,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
     }

     const int blocks_per_row = ncols / block_traits::qk;
-    constexpr int blocks_per_subgroup = safe_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
+    constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
     constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;

     static_assert(blocks_per_subgroup > 0);
@@ -536,7 +536,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                                const int nrows, dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
-    const int block_num_y = safe_div(nrows, GGML_SYCL_MMV_Y);
+    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
     constexpr size_t num_subgroups = 16;
     GGML_ASSERT(block_num_y % num_subgroups == 0);

From fc768f313450051423f277ed96cd470f9e515696 Mon Sep 17 00:00:00 2001
From: Romain Biessy
Date: Wed, 16 Apr 2025 18:38:09 +0200
Subject: [PATCH 06/14] Clarify qi comment

---
 ggml/src/ggml-sycl/quants.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp
index a9c3ddf7aea85..a74e30526c1c0 100644
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -37,7 +37,7 @@ template <ggml_type type> struct block_q_t;
 // qr number of weights in a byte (described as 'before dequantization')
 // for quantization types that have low and high bits split, qr is calculated
 // using the lower bits, e.g. for Q6 quants QR6 is 2
-// qi size of a block in 32 bit integers
+// qi number of 32 bit integers needed to represent all the quants from a block (`qs` field)
 // See ggml-common.h to see how these are calculated
 template <> struct block_q_t<GGML_TYPE_Q4_0> {
     struct traits {

From c7500c934bfc161bf9d37a37d02b2807da7e3ef4 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu
Date: Fri, 18 Apr 2025 13:57:56 +0800
Subject: [PATCH 07/14] change the reorder tensor from init to execute OP

---
 ggml/src/ggml-sycl/common.hpp    |   1 -
 ggml/src/ggml-sycl/ggml-sycl.cpp | 125 +++++++++++++++----------------
 2 files changed, 61 insertions(+), 65 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 3e1ceeaa49486..a8187b0a9187f 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -313,7 +313,6 @@ struct ggml_backend_sycl_context {
     int device;
     std::string name;
     optimize_feature opt_feature;
-    bool optimized_graph=false;

     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 
4d2fda0bfa6ae..22927338bda42 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -192,7 +192,7 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1); + g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Running with Environment Variables:\n"); @@ -2863,6 +2863,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) { } } +static void reorder_qw(char *data_device, const int ncols, const int nrows, + size_t size, size_t offset, dpct::queue_ptr stream) { + auto tmp_buf = sycl::malloc_shared(size, *stream); + SYCL_CHECK( + CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size) + .wait())); + GGML_ASSERT((size % sizeof(block_q4_0) == 0)); + GGML_ASSERT((offset % sizeof(block_q4_0) == 0)); + int offset_blks = offset / sizeof(block_q4_0); + auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;; + auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks; + + stream->parallel_for( + size / sizeof(block_q4_0), + [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + const block_q4_0* x = (const block_q4_0*)tmp_buf; + const int ib = i; + + for (int j = 0; j < QK4_0/2; j ++) + { + *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j]; + } + *(d_ptr + ib) = x[ib].d; + }); + + sycl::free(tmp_buf, *stream); +} + +static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { + char*data_device = (char*)src0->data; + size_t ncols = src0->ne[0]; + size_t nrows = src0->ne[1]; + size_t size = ggml_nbytes(src0); + + reorder_qw(data_device, ncols, nrows, size, 0, stream); +} + +/* +* This function could be called when the OP (mul_mat) function support reorder optimizition. +*/ +static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT + ctx->opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf. + dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases. + src0->type == GGML_TYPE_Q4_0 && + src1->ne[2]==1 && src1->ne[3]==1) { + + ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra; + if (!extra) return; //only happen in CI/UT permute case. + + if (extra->optimized_feature.reorder) return; //skip the tensor which is handled for reorder. + + reorder_qw(src0, ctx->stream()); + extra->optimized_feature.reorder = true; //used to decode/dequan in next steps. + } +} + static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer); @@ -2925,6 +2983,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor // KQ + KQV multi-batch ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { + opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder. 
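
For intuition, the transformation reorder_qw performs above can be written as a plain CPU loop. This sketch (illustrative only, with a toy stand-in for block_q4_0; QK4_0 is 32 in ggml-common.h) turns the interleaved [d, qs] blocks into one contiguous qs region followed by one contiguous d region, inside the same allocation, mirroring what the SYCL parallel_for does on device:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    constexpr int QK = 32;
    struct block_q4_0_toy {
        uint16_t d;           // scale (ggml_half in the real struct)
        uint8_t  qs[QK / 2];  // 32 x 4-bit quants packed into 16 bytes
    };

    // AoS [d0,qs0][d1,qs1]... -> [qs0,qs1,...][d0,d1,...] via a temporary copy.
    static void reorder_q4_0_cpu(uint8_t * data, size_t nblocks) {
        std::vector<block_q4_0_toy> tmp(nblocks);
        std::memcpy(tmp.data(), data, nblocks * sizeof(block_q4_0_toy));

        uint8_t * qs_ptr = data;  // quants region comes first
        // scales follow immediately after all the quants (offset is even, so
        // the uint16_t region stays aligned for any nblocks)
        uint16_t * d_ptr = reinterpret_cast<uint16_t *>(qs_ptr + nblocks * (QK / 2));

        for (size_t ib = 0; ib < nblocks; ++ib) {
            std::memcpy(qs_ptr + ib * (QK / 2), tmp[ib].qs, QK / 2);
            d_ptr[ib] = tmp[ib].d;
        }
    }

Both layouts occupy 18 bytes per block, which is why the reorder can happen in place after the staging copy.
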
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream()); } else if (use_mul_mat_vec_q) { @@ -2932,6 +2991,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor } else if (use_mul_mat_q) { ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true); } else { + opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder. ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); } } @@ -3561,71 +3621,8 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void reorder_qw(char *data_device, const int ncols, const int nrows, - size_t size, size_t offset, dpct::queue_ptr stream) { - auto tmp_buf = sycl::malloc_shared(size, *stream); - SYCL_CHECK( - CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size) - .wait())); - GGML_ASSERT((size % sizeof(block_q4_0) == 0)); - GGML_ASSERT((offset % sizeof(block_q4_0) == 0)); - int offset_blks = offset / sizeof(block_q4_0); - auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;; - auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks; - - stream->parallel_for( - size / sizeof(block_q4_0), - [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - const block_q4_0* x = (const block_q4_0*)tmp_buf; - const int ib = i; - - for (int j = 0; j < QK4_0/2; j ++) - { - *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j]; - } - *(d_ptr + ib) = x[ib].d; - }); - - sycl::free(tmp_buf, *stream); -} - -static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) { - char*data_device = (char*)src0->data; - size_t ncols = src0->ne[0]; - size_t nrows = src0->ne[1]; - size_t size = ggml_nbytes(src0); - - reorder_qw(data_device, ncols, nrows, size, 0, stream); -} - -static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) { - ggml_tensor *src0 = dst->src[0]; - ggml_tensor *src1 = dst->src[1]; - - if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 && - src1->ne[2]==1 && src1->ne[3]==1) { - reorder_qw(src0, stream); - ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra; - GGML_ASSERT(extra); - extra->optimized_feature.reorder = true; //used to decode/dequan in next steps. 
- } -} - -static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) { - dpct::queue_ptr stream = ctx->stream(); - if (ctx->optimized_graph) { - return; - } - ctx->optimized_graph = true; - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream); - } -} - static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) { ggml_sycl_set_main_device(sycl_ctx->device); - if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; From 1e0c4cfead6df227409a889006c1899cecc0bfd0 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Tue, 15 Apr 2025 18:03:57 +0200 Subject: [PATCH 08/14] dbg --- ggml/src/ggml-sycl/common.hpp | 21 +++-------- ggml/src/ggml-sycl/dmmv.cpp | 2 ++ ggml/src/ggml-sycl/ggml-sycl.cpp | 12 +++++-- ggml/src/ggml-sycl/mmvq.cpp | 2 ++ tests/test-backend-ops.cpp | 61 ++++++++++++++++++++++++++++++-- 5 files changed, 77 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 89474115978ed..cbac55b3ff920 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -42,6 +42,7 @@ void ggml_sycl_host_free(void* ptr); extern int g_ggml_sycl_debug; extern int g_ggml_sycl_disable_optimize; +extern int g_ggml_sycl_disable_mmvq; #define GGML_SYCL_DEBUG(...) \ do { \ @@ -285,25 +286,11 @@ struct ggml_tensor_extra_gpu { void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); -inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) { +inline optimize_feature check_gpu_optimize_feature(syclex::architecture &/*arch*/) { optimize_feature opt; - opt.reorder = - (arch == syclex::architecture::intel_gpu_dg1 || - arch == syclex::architecture::intel_gpu_acm_g10 || - arch == syclex::architecture::intel_gpu_acm_g11 || - arch == syclex::architecture::intel_gpu_acm_g12 || - arch == syclex::architecture::intel_gpu_pvc || - arch == syclex::architecture::intel_gpu_pvc_vg || - arch == syclex::architecture::intel_gpu_mtl_u || - arch == syclex::architecture::intel_gpu_mtl_s || - arch == syclex::architecture::intel_gpu_mtl_h || - arch == syclex::architecture::intel_gpu_arl_u || - arch == syclex::architecture::intel_gpu_arl_s || - arch == syclex::architecture::intel_gpu_arl_h || - arch == syclex::architecture::intel_gpu_bmg_g21 || - arch == syclex::architecture::intel_gpu_lnl_m - ); + // TODO: Romain change to Intel vendor? 
+ opt.reorder = true; return opt; } diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 04a85fa35ff2d..8cff26e4be1b9 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -1105,8 +1105,10 @@ void ggml_sycl_op_dequantize_mul_mat_vec( case GGML_TYPE_Q4_0: if ((ggml_tensor_extra_gpu*)dst->src[0]->extra && ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { + GGML_SYCL_DEBUG("Calling dequantize_mul_mat_vec_q4_0_sycl_reorder\n"); dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); } else { + GGML_SYCL_DEBUG("Calling dequantize_mul_mat_vec_q4_0_sycl\n"); dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); } break; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 41ec0d1f09d9d..fb9d7d4e43a48 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -48,6 +48,7 @@ static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; int g_ggml_sycl_disable_optimize = 0; int g_ggml_sycl_disable_graph = 0; +int g_ggml_sycl_disable_mmvq = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -194,11 +195,13 @@ static void ggml_check_sycl() try { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); + g_ggml_sycl_disable_mmvq = get_sycl_env("GGML_SYCL_DISABLE_MMVQ", 0); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Running with Environment Variables:\n"); GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize); GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph); + GGML_LOG_INFO(" GGML_SYCL_DISABLE_MMVQ: %d\n", g_ggml_sycl_disable_mmvq); GGML_LOG_INFO("Build with Macros:\n"); #if defined(GGML_SYCL_FORCE_MMQ) GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n"); @@ -2917,6 +2920,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) { static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer); int64_t min_compute_capability = INT_MAX; @@ -2961,14 +2965,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor static_cast(dst->src[0]->extra)->optimized_feature.reorder; // mmvq path is faster in the CUDA backend. 
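
Condensed, the rule the hunk below encodes is: MMVQ takes precedence over DMMV when MMVQ is usable and either the backend is CUDA or the reorder optimization applies to this tensor, unless MMVQ was disabled via GGML_SYCL_DISABLE_MMVQ=1. A standalone restatement (illustrative names, not the actual ggml API):

    // Inputs distilled from the dispatch state in ggml_sycl_mul_mat.
    struct dispatch_inputs {
        bool mmvq_usable;      // use_mul_mat_vec_q before this rule
        bool dmmv_usable;      // use_dequantize_mul_mat_vec before this rule
        bool is_cuda_backend;  // sycl::backend::ext_oneapi_cuda
        bool reorder_applies;  // reorder enabled and type supports reordered MMVQ
        bool mmvq_disabled;    // GGML_SYCL_DISABLE_MMVQ=1
    };

    static bool choose_dmmv(const dispatch_inputs & in) {
        bool dmmv = in.dmmv_usable;
        if (!in.mmvq_disabled && (in.is_cuda_backend || in.reorder_applies)) {
            dmmv = dmmv && !in.mmvq_usable;  // MMVQ wins over DMMV
        }
        return dmmv;
    }
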
- if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda + if (!g_ggml_sycl_disable_mmvq && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization // is enabled takes precedence over DMMV, the current if-else implementation // requires disabling DMMV if both conditions are met - || (reorder && ggml_sycl_supports_reorder_mmvq(src0->type))) { + || (reorder && ggml_sycl_supports_reorder_mmvq(src0->type)))) { use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; } + // TODO: Romain + GGML_SYCL_DEBUG("mul_mat use_dequantize_mul_mat_vec=%d use_mul_mat_vec_q=%d use_mul_mat_q=%d reorder=%d split=%d m=%ld n=%ld k=%ld batch0=%ld batch1=%ld\n", use_dequantize_mul_mat_vec, use_mul_mat_vec_q, use_mul_mat_q, reorder, split, src0->ne[1], src1->ne[1], src0->ne[0], src0->ne[3], src1->ne[3]); + if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // TODO: Refactor and cleanup of mul mat dispatching. if (src0->ne[3] == 1 && src1->ne[3] == 1) { @@ -2998,6 +3005,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor constexpr bool convert_src1_to_q8_1 = false; ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1); } + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 96a0ec0c091ae..a898d5b61610a 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1008,8 +1008,10 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q4_0: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n"); reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } else { + GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n"); mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } break; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e61a126cf5b2f..eed537726d142 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -36,6 +36,8 @@ #include #include +#include + static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { size_t nels = ggml_nelements(tensor); std::vector data(nels); @@ -47,8 +49,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m std::random_device rd; std::vector vec; vec.reserve(n_threads); - //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed - for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } + for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed + //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } return vec; }(); @@ -551,6 +553,54 @@ struct test_case { } } + struct err_t { + float a_val, b_val, err; + size_t i; + }; + std::vector top_k_abs_err; + std::vector top_k_rel_err; + size_t k = 10; + auto a = f1.data(); + auto b = f2.data(); // ref (cpu backend) + auto save_top_k_err = [=](size_t i, float a_i, float b_i, float err, std::vector& top_k_err) { + if (top_k_err.size() < k) { + top_k_err.push_back({a_i, b_i, err, i}); + if (top_k_err.size() == k) { + std::sort(top_k_err.begin(), 
top_k_err.end(), [](const err_t& x, const err_t& y) { + return x.err > y.err; + }); + } + } else if (top_k_err.back().err < err) { + top_k_err.back() = {a_i, b_i, err, i}; + std::sort(top_k_err.begin(), top_k_err.end(), [](const err_t& x, const err_t& y) { + return x.err > y.err; + }); + } + }; + double avg_abs_err = 0.f; + double avg_rel_err = 0.f; + for (size_t i = 0; i < f1.size(); i++) { + float a_i = a[i]; + float b_i = b[i]; + float abs_err = std::fabs(a_i - b_i); + float rel_err = (a_i - b_i) / std::fabs(b_i); + save_top_k_err(i, a_i, b_i, abs_err, top_k_abs_err); + save_top_k_err(i, a_i, b_i, rel_err, top_k_rel_err); + avg_abs_err += abs_err; + avg_rel_err += rel_err; + } + avg_abs_err /= f1.size(); + avg_rel_err /= f1.size(); + std::cout << "\nAvg abs err=" << avg_abs_err << " Top " << k << " abs err:\n"; + for (const auto& err : top_k_abs_err) { + std::cout << "i=" << err.i << " a=" << err.a_val << " b=" << err.b_val << " abs err=" << err.err << "\n"; + } + std::cout << "\nAvg rel err=" << avg_rel_err << " Top " << k << " rel err:\n"; + for (const auto& err : top_k_rel_err) { + std::cout << "i=" << err.i << " a=" << err.a_val << " b=" << err.b_val << " rel err=" << err.err << "\n"; + } + std::cout << std::endl; + double err = nmse(f1.data(), f2.data(), f1.size()); if (err > ud->max_err) { printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err); @@ -4134,6 +4184,13 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1})); } } + //TODO: Romain + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 11008, 1, 4096, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 11008, 2, 4096, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 1, 11008, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 1, 4096, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 2, 11008, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 2, 4096, {1, 1}, {1, 1})); #if 1 for (ggml_type type_a : base_types) { From dc19cd5fd57dd9378de2696697c55275edf437ca Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Mon, 28 Apr 2025 15:08:04 +0100 Subject: [PATCH 09/14] Undo changes to test-backend-ops --- tests/test-backend-ops.cpp | 72 +++++--------------------------------- 1 file changed, 9 insertions(+), 63 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index bb6ac7109c6e9..d70acb7719435 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -36,8 +36,6 @@ #include #include -#include - static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { size_t nels = ggml_nelements(tensor); std::vector data(nels); @@ -49,8 +47,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m std::random_device rd; std::vector vec; vec.reserve(n_threads); - for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed - //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } + //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed + for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } return vec; }(); @@ -561,54 +559,6 @@ struct test_case { } } - struct err_t { - float a_val, b_val, err; - size_t i; - 
}; - std::vector top_k_abs_err; - std::vector top_k_rel_err; - size_t k = 10; - auto a = f1.data(); - auto b = f2.data(); // ref (cpu backend) - auto save_top_k_err = [=](size_t i, float a_i, float b_i, float err, std::vector& top_k_err) { - if (top_k_err.size() < k) { - top_k_err.push_back({a_i, b_i, err, i}); - if (top_k_err.size() == k) { - std::sort(top_k_err.begin(), top_k_err.end(), [](const err_t& x, const err_t& y) { - return x.err > y.err; - }); - } - } else if (top_k_err.back().err < err) { - top_k_err.back() = {a_i, b_i, err, i}; - std::sort(top_k_err.begin(), top_k_err.end(), [](const err_t& x, const err_t& y) { - return x.err > y.err; - }); - } - }; - double avg_abs_err = 0.f; - double avg_rel_err = 0.f; - for (size_t i = 0; i < f1.size(); i++) { - float a_i = a[i]; - float b_i = b[i]; - float abs_err = std::fabs(a_i - b_i); - float rel_err = (a_i - b_i) / std::fabs(b_i); - save_top_k_err(i, a_i, b_i, abs_err, top_k_abs_err); - save_top_k_err(i, a_i, b_i, rel_err, top_k_rel_err); - avg_abs_err += abs_err; - avg_rel_err += rel_err; - } - avg_abs_err /= f1.size(); - avg_rel_err /= f1.size(); - std::cout << "\nAvg abs err=" << avg_abs_err << " Top " << k << " abs err:\n"; - for (const auto& err : top_k_abs_err) { - std::cout << "i=" << err.i << " a=" << err.a_val << " b=" << err.b_val << " abs err=" << err.err << "\n"; - } - std::cout << "\nAvg rel err=" << avg_rel_err << " Top " << k << " rel err:\n"; - for (const auto& err : top_k_rel_err) { - std::cout << "i=" << err.i << " a=" << err.a_val << " b=" << err.b_val << " rel err=" << err.err << "\n"; - } - std::cout << std::endl; - double err = nmse(f1.data(), f2.data(), f1.size()); if (err > ud->max_err) { printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err); @@ -2121,7 +2071,7 @@ struct test_mul_mat_id : public test_case { const ggml_type type_b; const int n_mats; const int n_used; - const bool b; // brodcast b matrix + const bool b; // broadcast b matrix const int64_t m; const int64_t n; const int64_t k; @@ -2656,6 +2606,8 @@ struct test_rope : public test_case { } else { out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); } + + // TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp } ggml_set_name(out, "out"); @@ -4195,13 +4147,6 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1})); } } - //TODO: Romain - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 11008, 1, 4096, {1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 11008, 2, 4096, {1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 1, 11008, {1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 1, 4096, {1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 2, 11008, {1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 4096, 2, 4096, {1, 1}, {1, 1})); #if 1 for (ggml_type type_a : base_types) { @@ -4485,10 +4430,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_leaky_relu()); - for (int hsk : { 64, 80, 128, 192, 256, }) { - for (int hsv : { 64, 80, 128, 192, 256, }) { - if (hsk != 192 && hsk != hsv) continue; + for (int hsk : { 64, 80, 128, 192, 256, 576 
}) { + for (int hsv : { 64, 80, 128, 192, 256, 512 }) { + if (hsk != 192 && hsk != 576 && hsk != hsv) continue; if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; + if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA for (bool mask : { true, false } ) { for (float max_bias : { 0.0f, 8.0f }) { From 351ef2b97fe95e911d26cddaad861c79e802ffd6 Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Mon, 28 Apr 2025 15:08:48 +0100 Subject: [PATCH 10/14] Refactor changes on top of q4_0 reorder fix --- ggml/src/ggml-sycl/ggml-sycl.cpp | 54 ++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 4bbfe1ab9336e..a5fd2dc49f5d4 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2847,6 +2847,24 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) { return false; } +inline bool ggml_sycl_supports_reorder_dequantize(enum ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return true; + default: + return false; + } +} + +inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return true; + default: + return false; + } +} + inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: @@ -2884,7 +2902,7 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows, GGML_ASSERT((size % sizeof(block_q4_0) == 0)); GGML_ASSERT((offset % sizeof(block_q4_0) == 0)); int offset_blks = offset / sizeof(block_q4_0); - auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;; + auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2; auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks; stream->parallel_for( @@ -2912,17 +2930,19 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { reorder_qw(data_device, ncols, nrows, size, 0, stream); } +static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) { + return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT + ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf. + dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases. + dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1; +} + /* * This function could be called when the OP (mul_mat) function support reorder optimizition. */ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT - ctx->opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf. - dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases. - src0->type == GGML_TYPE_Q4_0 && - src1->ne[2]==1 && src1->ne[3]==1) { - + if (should_reorder_tensor(*ctx, dst)) { ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra; if (!extra) return; //only happen in CI/UT permute case. @@ -2975,21 +2995,16 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE); #endif // SYCL_USE_XMX - const bool reorder = static_cast(dst->src[0]->extra) && - static_cast(dst->src[0]->extra)->optimized_feature.reorder; // mmvq path is faster in the CUDA backend. 
if (!g_ggml_sycl_disable_mmvq && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization // is enabled takes precedence over DMMV, the current if-else implementation // requires disabling DMMV if both conditions are met - || (reorder && ggml_sycl_supports_reorder_mmvq(src0->type)))) { + || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) { use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; } - // TODO: Romain - printf("\n\n ** mul_mat use_dequantize_mul_mat_vec=%d use_mul_mat_vec_q=%d use_mul_mat_q=%d reorder=%d split=%d m=%ld n=%ld k=%ld batch0=%ld batch1=%ld\n", use_dequantize_mul_mat_vec, use_mul_mat_vec_q, use_mul_mat_q, reorder, split, src0->ne[1], src1->ne[1], src0->ne[0], src0->ne[3], src1->ne[3]); - if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // TODO: Refactor and cleanup of mul mat dispatching. if (src0->ne[3] == 1 && src1->ne[3] == 1) { @@ -3008,19 +3023,24 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { constexpr bool convert_src1_to_q8_1 = false; - opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder. + if (ggml_sycl_supports_reorder_dmmv(src0->type)) { + opt_for_reorder(&ctx, src0, src1, dst); + } ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1); - // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream()); } else if (use_mul_mat_vec_q) { constexpr bool convert_src1_to_q8_1 = true; - opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder. + if (ggml_sycl_supports_reorder_mmvq(src0->type)) { + opt_for_reorder(&ctx, src0, src1, dst); + } ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1); } else if (use_mul_mat_q) { constexpr bool convert_src1_to_q8_1 = true; ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1); } else { constexpr bool convert_src1_to_q8_1 = false; - opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder. + if (ggml_sycl_supports_reorder_dequantize(src0->type)) { + opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder. + } ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1); } GGML_SYCL_DEBUG("call %s done\n", __func__); From d61dda32825aec923496f8d2e1794e380f1c4c50 Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Mon, 28 Apr 2025 15:55:06 +0100 Subject: [PATCH 11/14] Missing Reverts --- ggml/src/ggml-sycl/common.hpp | 23 ++++++++++++++++++----- ggml/src/ggml-sycl/dmmv.cpp | 2 -- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 12b0cdc9c5e0f..bc54900cb076b 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -42,7 +42,7 @@ void ggml_sycl_host_free(void* ptr); extern int g_ggml_sycl_debug; extern int g_ggml_sycl_disable_optimize; -extern int g_ggml_sycl_disable_mmvq; +extern int g_ggml_sycl_prioritize_dmmv; #define GGML_SYCL_DEBUG(...) 
     do { \
@@ -286,11 +286,25 @@ struct ggml_tensor_extra_gpu {
 
 void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
 
-inline optimize_feature check_gpu_optimize_feature(syclex::architecture &/*arch*/) {
+inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
     optimize_feature opt;
-    // TODO: Romain change to Intel vendor?
-    opt.reorder = true;
+    opt.reorder =
+        (arch == syclex::architecture::intel_gpu_dg1 ||
+         arch == syclex::architecture::intel_gpu_acm_g10 ||
+         arch == syclex::architecture::intel_gpu_acm_g11 ||
+         arch == syclex::architecture::intel_gpu_acm_g12 ||
+         arch == syclex::architecture::intel_gpu_pvc ||
+         arch == syclex::architecture::intel_gpu_pvc_vg ||
+         arch == syclex::architecture::intel_gpu_mtl_u ||
+         arch == syclex::architecture::intel_gpu_mtl_s ||
+         arch == syclex::architecture::intel_gpu_mtl_h ||
+         arch == syclex::architecture::intel_gpu_arl_u ||
+         arch == syclex::architecture::intel_gpu_arl_s ||
+         arch == syclex::architecture::intel_gpu_arl_h ||
+         arch == syclex::architecture::intel_gpu_bmg_g21 ||
+         arch == syclex::architecture::intel_gpu_lnl_m
+        );
     return opt;
 }
@@ -485,5 +499,4 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
 }
 
 bool gpu_has_xmx(sycl::device &dev);
-
 #endif // GGML_SYCL_COMMON_HPP
diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
index 8cff26e4be1b9..04a85fa35ff2d 100644
--- a/ggml/src/ggml-sycl/dmmv.cpp
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -1105,10 +1105,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
         case GGML_TYPE_Q4_0:
             if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
                 ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
-                GGML_SYCL_DEBUG("Calling dequantize_mul_mat_vec_q4_0_sycl_reorder\n");
                 dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             } else {
-                GGML_SYCL_DEBUG("Calling dequantize_mul_mat_vec_q4_0_sycl\n");
                 dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             }
             break;
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 4486ae5f634bd..7f9c2eb9a0383 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -202,7 +202,7 @@ static void ggml_check_sycl() try {
     GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
     GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
     GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
-    GGML_LOG_INFO(" GGML_SYCL_DISABLE_MMVQ: %d\n", g_ggml_sycl_prioritize_dmmv);
+    GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
     GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
     GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");

From 48480c89fa5eeb86083c732454181dac89a067f0 Mon Sep 17 00:00:00 2001
From: Alberto Cabrera
Date: Mon, 5 May 2025 13:10:48 +0100
Subject: [PATCH 12/14] Refactored opt_for_reorder logic to simplify code path

---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 60 +++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 7f9c2eb9a0383..4d394df20b94c 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2831,13 +2831,19 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
 
+enum class Mul_Mat_Algo {
+    DMMV = 0,
+    MMVQ = 1,
+    MUL_MAT_SYCL = 2,
+};
+
 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     // TODO: accuracy issues in MMQ
     GGML_UNUSED(type);
     return false;
 }
 
-inline bool ggml_sycl_supports_reorder_dequantize(enum ggml_type type) {
+inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
             return true;
@@ -2927,20 +2933,37 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
            dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
 }
 
-/*
-* This function could be called when the OP (mul_mat) function supports the reorder optimization.
-*/
-static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
-    ggml_tensor * dst) {
-    if (should_reorder_tensor(*ctx, dst)) {
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        if (!extra) return; // only happens in the CI/UT permute case
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
+                            ggml_tensor * dst, Mul_Mat_Algo mul_mat_algo) {
+    if (!should_reorder_tensor(*ctx, dst)) {
+        return;
+    }
 
-        if (extra->optimized_feature.reorder) return; //skip the tensor which is handled for reorder.
+    ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
+    if (!extra || extra->optimized_feature.reorder) {
+        return; // Skip permutations and already reordered tensors
+    }
 
-        reorder_qw(src0, ctx->stream());
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
+    switch (mul_mat_algo) {
+        case Mul_Mat_Algo::DMMV:
+            if (!ggml_sycl_supports_reorder_dmmv(src0->type)) {
+                return;
+            }
+            break;
+        case Mul_Mat_Algo::MMVQ:
+            if (!ggml_sycl_supports_reorder_mmvq(src0->type)) {
+                return;
+            }
+            break;
+        case Mul_Mat_Algo::MUL_MAT_SYCL:
+            if (!ggml_sycl_supports_reorder_mul_mat_sycl(src0->type)) {
+                return;
+            }
+            break;
     }
+
+    reorder_qw(src0, ctx->stream());
+    extra->optimized_feature.reorder = true; // Used to decode/dequantize in next steps and avoid re-reordering
 }
 
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_tensor * dst) {
@@ -3013,24 +3036,19 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         constexpr bool convert_src1_to_q8_1 = false;
-        if (ggml_sycl_supports_reorder_dmmv(src0->type)) {
-            opt_for_reorder(&ctx, src0, src1, dst);
-        }
+        opt_for_reorder(&ctx, src0, src1, dst, Mul_Mat_Algo::DMMV);
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
     } else if (use_mul_mat_vec_q) {
         constexpr bool convert_src1_to_q8_1 = true;
-        if (ggml_sycl_supports_reorder_mmvq(src0->type)) {
-            opt_for_reorder(&ctx, src0, src1, dst);
-        }
+        opt_for_reorder(&ctx, src0, src1, dst, Mul_Mat_Algo::MMVQ);
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
     } else if (use_mul_mat_q) {
         constexpr bool convert_src1_to_q8_1 = true;
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
     } else {
         constexpr bool convert_src1_to_q8_1 = false;
-        if (ggml_sycl_supports_reorder_dequantize(src0->type)) {
-            opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder
-        }
+        // MUL_MAT_SYCL supports reorder
+        opt_for_reorder(&ctx, src0, src1, dst, Mul_Mat_Algo::MUL_MAT_SYCL);
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
     }
     GGML_SYCL_DEBUG("call %s done\n", __func__);

From 6fe27eb4e86410a3a494a76067580a37aa623792 Mon Sep 17 00:00:00 2001
From: Alberto Cabrera
Date: Fri, 9 May 2025 12:23:33 +0100
Subject: [PATCH 13/14] Explicit inlining and unroll

---
 ggml/src/ggml-sycl/mmvq.cpp    | 1 +
 ggml/src/ggml-sycl/vecdotq.hpp | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index a898d5b61610a..3cade1a42a6fe 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -40,6 +40,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
         // Y block index that aligns with ibx
         const int iby = i * block_type::block_to_q8_1_ratio();
 
+#pragma unroll
         for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
             // x block quant index when casting the quants to int
             const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
index 29e80bbc28a95..cbf664fcf284b 100644
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -265,7 +265,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
     using q4_0_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_0>;
     using q4_0_traits = typename q4_0_block::traits;
 
-    float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, const sycl::half2 & ds8) {
+    __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, const sycl::half2 & ds8) {
         int sumi = 0;
 
 #pragma unroll
@@ -284,7 +284,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
         return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
     }
 
-    float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
                      const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
         const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
         const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));

From e809b0737877e71cdf1b80f1d25800b2ff82068e Mon Sep 17 00:00:00 2001
From: Alberto Cabrera
Date: Fri, 9 May 2025 13:12:25 +0100
Subject: [PATCH 14/14] Renamed mul_mat_algo enum for consistency

---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 077db2c4d849c..0ea729948ec7a 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2825,7 +2825,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
     std::exit(1);
 }
 
-enum class Mul_Mat_Algo {
+enum class mul_mat_algo {
     DMMV = 0,
     MMVQ = 1,
     MUL_MAT_SYCL = 2,
@@ -2928,7 +2928,7 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
 }
 
 static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
-                            ggml_tensor * dst, Mul_Mat_Algo mul_mat_algo) {
+                            ggml_tensor * dst, mul_mat_algo mm_algorithm) {
     if (!should_reorder_tensor(*ctx, dst)) {
         return;
     }
@@ -2938,18 +2938,18 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
         return; // Skip permutations and already reordered tensors
     }
 
-    switch (mul_mat_algo) {
-        case Mul_Mat_Algo::DMMV:
+    switch (mm_algorithm) {
+        case mul_mat_algo::DMMV:
             if (!ggml_sycl_supports_reorder_dmmv(src0->type)) {
                 return;
             }
             break;
-        case Mul_Mat_Algo::MMVQ:
+        case mul_mat_algo::MMVQ:
             if (!ggml_sycl_supports_reorder_mmvq(src0->type)) {
                 return;
             }
             break;
-        case Mul_Mat_Algo::MUL_MAT_SYCL:
+        case mul_mat_algo::MUL_MAT_SYCL:
             if (!ggml_sycl_supports_reorder_mul_mat_sycl(src0->type)) {
                 return;
             }
             break;
@@ -3030,11 +3030,11 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         constexpr bool convert_src1_to_q8_1 = false;
-        opt_for_reorder(&ctx, src0, src1, dst, Mul_Mat_Algo::DMMV);
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
     } else if (use_mul_mat_vec_q) {
         constexpr bool convert_src1_to_q8_1 = true;
-        opt_for_reorder(&ctx, src0, src1, dst, Mul_Mat_Algo::MMVQ);
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
     } else if (use_mul_mat_q) {
         constexpr bool convert_src1_to_q8_1 = true;
@@ -3042,7 +3042,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     } else {
         constexpr bool convert_src1_to_q8_1 = false;
         // MUL_MAT_SYCL supports reorder
-        opt_for_reorder(&ctx, src0, src1, dst, Mul_Mat_Algo::MUL_MAT_SYCL);
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
     }
     GGML_SYCL_DEBUG("call %s done\n", __func__);
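
A note on the data layout the whole series relies on. reorder_qw() in PATCH 10 rewrites a Q4_0 tensor in place from ggml's array-of-blocks layout into two contiguous regions: every block's quant nibbles first, then every block's fp16 scale, which is exactly what the qs_ptr / d_ptr arithmetic encodes. The following is a minimal host-side sketch of that transformation and of how a consumer addresses it afterwards; block_q4_0 and QK4_0 mirror ggml's definitions, but the functions themselves are illustrative, not code from the patches (the real reorder runs on-device in a parallel_for):

#include <cstdint>
#include <cstring>
#include <vector>

constexpr int QK4_0 = 32;                  // elements per Q4_0 block, as in ggml

struct block_q4_0 {                        // ggml's array-of-blocks layout
    uint16_t d;                            // fp16 scale bits
    uint8_t  qs[QK4_0 / 2];                // 32 4-bit quants, two per byte
};

// AoS -> split SoA: all quants, then all scales, matching
//   qs_ptr = base;  d_ptr = base + ncols * nrows / 2;
std::vector<uint8_t> reorder_q4_0(const block_q4_0 * blocks, size_t nblocks) {
    const size_t qs_bytes = nblocks * (QK4_0 / 2);
    std::vector<uint8_t> out(qs_bytes + nblocks * sizeof(uint16_t));
    uint8_t * qs_ptr = out.data();
    uint8_t * d_ptr  = out.data() + qs_bytes;          // scales become a contiguous tail
    for (size_t ib = 0; ib < nblocks; ++ib) {
        std::memcpy(qs_ptr + ib * (QK4_0 / 2), blocks[ib].qs, QK4_0 / 2);
        std::memcpy(d_ptr + ib * sizeof(uint16_t), &blocks[ib].d, sizeof(uint16_t));
    }
    return out;
}

// Locating one block afterwards; the ibx_offset / d_offset parameters of the
// reorder-aware operator() in PATCH 13 encode these same two offsets.
const uint8_t * block_qs(const uint8_t * base, size_t ib) { return base + ib * (QK4_0 / 2); }
uint16_t block_d(const uint8_t * base, size_t nblocks, size_t ib) {
    uint16_t bits;
    std::memcpy(&bits, base + nblocks * (QK4_0 / 2) + ib * sizeof(uint16_t), sizeof(bits));
    return bits;
}

The point of the split is coalescing: consecutive work-items read consecutive quant bytes, and the scales, touched once per block, no longer interleave with those loads.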
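
PATCH 11's check_gpu_optimize_feature() gates the reorder on the device architecture rather than enabling it unconditionally. A hedged sketch of the underlying query, using the oneAPI experimental device-architecture extension that the patch's syclex alias refers to (the wrapper below is illustrative and shows only two of the whitelisted architectures; it assumes a DPC++ compiler that ships the extension):

#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

// Returns true only for GPU families where the reorder is known to pay off;
// PATCH 11 whitelists DG1, the Arc (ACM) parts, PVC, MTL, ARL, BMG and LNL.
bool device_supports_reorder(const sycl::device & dev) {
    const auto arch = dev.get_info<syclex::info::device::architecture>();
    return arch == syclex::architecture::intel_gpu_pvc ||
           arch == syclex::architecture::intel_gpu_bmg_g21;   // subset shown
}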
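
The dispatch changes spread across PATCHES 10-14 are easier to follow as one decision function. Below is a condensed, hedged reading of the final precedence; the enum and the supports_* names are taken from the code, but the free-standing function, its boolean parameters and the omission of MMQ/batched paths are simplifications for illustration:

enum class mul_mat_algo { DMMV, MMVQ, MUL_MAT_SYCL };

// dmmv_ok / mmvq_ok stand in for use_dequantize_mul_mat_vec /
// use_mul_mat_vec_q after the earlier capability checks; prioritize_dmmv
// stands in for the GGML_SYCL_PRIORITIZE_DMMV toggle.
static mul_mat_algo pick_vec_path(bool dmmv_ok, bool mmvq_ok, bool prioritize_dmmv,
                                  bool cuda_backend, bool reorder_mmvq_supported) {
    // MMVQ wins over DMMV on the CUDA backend, and also whenever the tensor
    // will be reordered and a reorder-aware MMVQ kernel exists; this mirrors
    // "use_dequantize_mul_mat_vec && !use_mul_mat_vec_q" in the patch.
    if (!prioritize_dmmv && mmvq_ok && (cuda_backend || reorder_mmvq_supported)) {
        dmmv_ok = false;
    }
    if (dmmv_ok) { return mul_mat_algo::DMMV; }
    if (mmvq_ok) { return mul_mat_algo::MMVQ; }
    return mul_mat_algo::MUL_MAT_SYCL;   // fallback dequantize + GEMM path
}

Passing the algorithm into opt_for_reorder(), as PATCH 12 does, then keeps the per-algorithm reorder support check in one place instead of duplicating if-guards at every call site.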
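
PATCH 13 only adds inlining and unrolling, but the arithmetic inside vec_dot_q4_0_q8_1_impl is worth spelling out. Q4_0 stores quants in [0, 15] with an implicit -8 offset folded into the formula, and Q8_1's ds carries { d8, d8 * sum(q8) }, so the offset correction needs no second pass over the data. A scalar, full-block reference of the same math follows; it is a sketch for clarity, not the kernel, which works on int-packed quants and per-subgroup partial sums with a (8 * vdr_mmvq / qi) scaling of ds8.y():

#include <cstdint>

// One full Q4_0 x Q8_1 block dot product.
// q4: 16 packed bytes; element i sits in the low nibble of byte i and
// element i + 16 in the high nibble, as in ggml's Q4_0 packing.
// ds8_y must equal d8 * sum(q8) over the block, matching Q8_1's ds.y().
float q4_0_q8_1_block_dot(const uint8_t * q4, const int8_t * q8,
                          float d4, float d8, float ds8_y) {
    int sumi = 0;
    for (int i = 0; i < 16; ++i) {
        const int lo = q4[i] & 0x0F;        // quant of element i
        const int hi = q4[i] >> 4;          // quant of element i + 16
        sumi += lo * q8[i] + hi * q8[i + 16];
    }
    // d4 * (sumi * d8 - 8 * ds8_y)  ==  d4 * d8 * sum((q4 - 8) * q8)
    return d4 * (sumi * d8 - 8.0f * ds8_y);
}

Expanding d4 * (q4 - 8) * d8 * q8 and summing gives d4 * (sumi * d8 - 8 * d8 * sum(q8)), which is the expression the kernel returns; the precomputed sum in ds8 is what makes the -8 offset free at dot-product time.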