Skip to content

Introduce ThreadPool and use it for quicksort #201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ jobs:
CXX: clang++-18
run: |
make clean
meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir
meson setup -Dbuild_tests=true -Duse_openmp=true -Duse_stdthreads=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir
cd builddir
ninja

Expand Down Expand Up @@ -202,7 +202,7 @@ jobs:
CXX: clang++-18
run: |
make clean
meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir
meson setup -Dbuild_tests=true -Duse_openmp=true -Duse_stdthreads=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir
cd builddir
ninja

Expand Down Expand Up @@ -235,7 +235,7 @@ jobs:
CXX: g++-10
run: |
make clean
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
meson setup -Dbuild_tests=true -Duse_openmp=true -Duse_stdthreads=true --warnlevel 2 --werror --buildtype release builddir
cd builddir
ninja

Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ test_openmp:
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

test_stdthreads:
meson setup -Dbuild_tests=true -Duse_stdthreads=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

test_asan:
meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Db_lundef=false -Dasan_ci_dont_validate=true --warnlevel 0 --buildtype debugoptimized builddir
cd builddir && ninja
Expand Down
18 changes: 8 additions & 10 deletions lib/meson.build
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
libtargets = []

# Add compile flags for OpenMP if enabled
openmpflags = []
if get_option('use_openmp')
openmpflags = ['-DXSS_USE_OPENMP=true', '-fopenmp']
endif

if cpp.has_argument('-march=haswell')
libtargets += static_library('libavx',
files(
'x86simdsort-avx2.cpp',
),
include_directories : [src],
cpp_args : ['-march=haswell', openmpflags],
cpp_args : ['-march=haswell', stdthreadsflag],
dependencies: [omp_dep],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand All @@ -23,7 +18,8 @@ if cpp.has_argument('-march=skylake-avx512')
'x86simdsort-skx.cpp',
),
include_directories : [src],
cpp_args : ['-march=skylake-avx512', openmpflags],
cpp_args : ['-march=skylake-avx512', stdthreadsflag],
dependencies: [omp_dep],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand All @@ -34,7 +30,8 @@ if cpp.has_argument('-march=icelake-client')
'x86simdsort-icl.cpp',
),
include_directories : [src],
cpp_args : ['-march=icelake-client', openmpflags],
cpp_args : ['-march=icelake-client', stdthreadsflag],
dependencies: [omp_dep],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand All @@ -45,7 +42,8 @@ if cancompilefp16
'x86simdsort-spr.cpp',
),
include_directories : [src],
cpp_args : ['-march=sapphirerapids', openmpflags],
cpp_args : ['-march=sapphirerapids', stdthreadsflag],
dependencies: [omp_dep],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand Down
18 changes: 16 additions & 2 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,20 @@ if get_option('build_vqsortbench')
benchvq = true
endif

# build with openmp
omp = []
omp_dep = []
if get_option('use_openmp')
omp = dependency('openmp', required : true)
omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP'])
endif

# build with std::threads
stdthreadsflag = []
if get_option('use_stdthreads')
stdthreadsflag += ['-DXSS_BUILD_WITH_STD_THREADS']
endif

fp16code = '''#include<immintrin.h>
int main() {
__m512h temp = _mm512_set1_ph(1.0f);
Expand All @@ -43,8 +57,8 @@ if get_option('lib_type') == 'shared'
libsimdsort = shared_library('x86simdsortcpp',
'lib/x86simdsort.cpp',
include_directories : [src, utils, lib],
link_args : [openmpflags],
link_with : [libtargets],
dependencies: [omp],
gnu_symbol_visibility : 'inlineshidden',
install : true,
soversion : 1,
Expand All @@ -53,7 +67,7 @@ else
libsimdsort = static_library('x86simdsortcpp',
'lib/x86simdsort.cpp',
include_directories : [src, utils, lib],
link_args : [openmpflags],
dependencies: [omp],
link_with : [libtargets],
gnu_symbol_visibility : 'inlineshidden',
install : true,
Expand Down
2 changes: 2 additions & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ option('build_vqsortbench', type : 'boolean', value : true,
description : 'Add google vqsort to benchmarks (default: "true").')
option('use_openmp', type : 'boolean', value : false,
description : 'Use OpenMP to accelerate key-value sort (default: "false").')
option('use_stdthreads', type : 'boolean', value : false,
description : 'Use std::threads to accelerate qsort (default: "false").')
option('lib_type', type : 'string', value : 'shared',
description : 'Library type: shared or static (default: "shared").')
option('fatal_sanitizers', type : 'boolean', value : 'false',
Expand Down
2 changes: 1 addition & 1 deletion scripts/bench-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then
fi
compare=$(realpath .bench/google-benchmark/tools/compare.py)

meson setup -Dbuild_benchmarks=true -Dbuild_ippbench=true --warnlevel 0 --buildtype release builddir-${branch}
meson setup -Dbuild_benchmarks=true -Duse_stdthreads=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir-${branch}
cd builddir-${branch}
ninja
$compare filters ./benchexe $1 $2 --benchmark_repetitions=$3
2 changes: 1 addition & 1 deletion scripts/branch-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ build_branch() {
fi
fi
cd $dir_name
meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir
meson setup -Dbuild_benchmarks=true -Duse_openmp=true -Duse_stdthreads=true --warnlevel 0 --buildtype release builddir
cd builddir
ninja
cd ../../
Expand Down
41 changes: 23 additions & 18 deletions src/avx512-16bit-qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -548,25 +548,35 @@ avx512_qsort_fp16_helper(uint16_t *arr, arrsize_t arrsize)
using T = uint16_t;
using vtype = zmm_vector<float16>;

#ifdef XSS_COMPILE_OPENMP
#ifdef XSS_BUILD_WITH_STD_THREADS
bool use_parallel = arrsize > 100000;
#else
bool use_parallel = false;
#endif

if (use_parallel) {
// This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
#ifdef XSS_BUILD_WITH_STD_THREADS

// This thread limit was determined experimentally
constexpr int thread_limit = 8;
int thread_count = std::min(thread_limit, omp_get_max_threads());
int thread_count = std::min(thread_limit,
(int)std::thread::hardware_concurrency());
Comment on lines 561 to +563
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is 8 threads in a system with a lot of cores still better? Maybe you need to partition the data better to avoid locality problems.

arrsize_t task_threshold = std::max((arrsize_t)100000, arrsize / 100);

// We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_
// The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
// Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
#pragma omp parallel num_threads(thread_count)
#pragma omp single
qsort_<vtype, comparator, T>(arr,
0,
arrsize - 1,
2 * (arrsize_t)log2(arrsize),
task_threshold);
// Create a thread pool
xss::tp::ThreadPool pool(thread_count);

// Initial sort task
qsort_threads<vtype, comparator, T>(arr,
0,
arrsize - 1,
2 * (arrsize_t)log2(arrsize),
task_threshold,
pool);

// Wait for all tasks to complete
pool.wait_all();
#endif
}
else {
qsort_<vtype, comparator, T>(arr,
Expand All @@ -575,11 +585,6 @@ avx512_qsort_fp16_helper(uint16_t *arr, arrsize_t arrsize)
2 * (arrsize_t)log2(arrsize),
std::numeric_limits<arrsize_t>::max());
}
#pragma omp taskwait
#else
qsort_<vtype, comparator, T>(
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
#endif
}

[[maybe_unused]] X86_SIMD_SORT_INLINE void
Expand Down
Loading
Loading