Skip to content

Commit b42c332

Browse files
authored
[libc] Use Atomics in GPU Benchmarks (#98842)
This PR replaces our old method of reducing the benchmark results, which used an array, with one that uses atomics. This should help us implement single-threaded benchmarks.
1 parent 1663ac5 commit b42c332

File tree

3 files changed

+97
-43
lines changed

3 files changed

+97
-43
lines changed

libc/benchmarks/gpu/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ add_unittest_framework_library(
4343
libc.src.__support.CPP.functional
4444
libc.src.__support.CPP.limits
4545
libc.src.__support.CPP.algorithm
46+
libc.src.__support.CPP.atomic
4647
libc.src.__support.fixed_point.fx_rep
4748
libc.src.__support.macros.properties.types
4849
libc.src.__support.OSUtil.osutil

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

Lines changed: 94 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "LibcGpuBenchmark.h"
22
#include "src/__support/CPP/algorithm.h"
33
#include "src/__support/CPP/array.h"
4+
#include "src/__support/CPP/atomic.h"
45
#include "src/__support/CPP/string.h"
56
#include "src/__support/FPUtil/sqrt.h"
67
#include "src/__support/GPU/utils.h"
@@ -12,61 +13,113 @@ namespace LIBC_NAMESPACE_DECL {
1213
namespace benchmarks {
1314

1415
FixedVector<Benchmark *, 64> benchmarks;
15-
cpp::array<BenchmarkResult, 1024> results;
1616

1717
void Benchmark::add_benchmark(Benchmark *benchmark) {
1818
benchmarks.push_back(benchmark);
1919
}
2020

21-
BenchmarkResult
22-
reduce_results(const cpp::array<BenchmarkResult, 1024> &results) {
23-
BenchmarkResult result;
24-
uint64_t cycles_sum = 0;
25-
double standard_deviation_sum = 0;
26-
uint64_t min = UINT64_MAX;
27-
uint64_t max = 0;
28-
uint32_t samples_sum = 0;
29-
uint32_t iterations_sum = 0;
30-
clock_t time_sum = 0;
31-
uint64_t num_threads = gpu::get_num_threads();
32-
for (uint64_t i = 0; i < num_threads; i++) {
33-
BenchmarkResult current_result = results[i];
34-
cycles_sum += current_result.cycles;
35-
standard_deviation_sum += current_result.standard_deviation;
36-
min = cpp::min(min, current_result.min);
37-
max = cpp::max(max, current_result.max);
38-
samples_sum += current_result.samples;
39-
iterations_sum += current_result.total_iterations;
40-
time_sum += current_result.total_time;
21+
struct AtomicBenchmarkSums {
22+
cpp::Atomic<uint64_t> cycles_sum = 0;
23+
cpp::Atomic<uint64_t> standard_deviation_sum = 0;
24+
cpp::Atomic<uint64_t> min = UINT64_MAX;
25+
cpp::Atomic<uint64_t> max = 0;
26+
cpp::Atomic<uint32_t> samples_sum = 0;
27+
cpp::Atomic<uint32_t> iterations_sum = 0;
28+
cpp::Atomic<clock_t> time_sum = 0;
29+
cpp::Atomic<uint64_t> active_threads = 0;
30+
31+
// Puts every accumulator back to its starting value: the thread counter and
// all sums become 0, `min` becomes UINT64_MAX and `max` becomes 0. The
// relaxed stores are bracketed by a release fence on each side.
void reset() {
  constexpr auto RELAXED = cpp::MemoryOrder::RELAXED;
  constexpr auto RELEASE = cpp::MemoryOrder::RELEASE;

  cpp::atomic_thread_fence(RELEASE);

  active_threads.store(0, RELAXED);
  cycles_sum.store(0, RELAXED);
  standard_deviation_sum.store(0, RELAXED);
  // `min` starts at the largest value so that any real sample lowers it.
  min.store(UINT64_MAX, RELAXED);
  max.store(0, RELAXED);
  samples_sum.store(0, RELAXED);
  iterations_sum.store(0, RELAXED);
  time_sum.store(0, RELAXED);

  cpp::atomic_thread_fence(RELEASE);
}
42-
result.cycles = cycles_sum / num_threads;
43-
result.standard_deviation = standard_deviation_sum / num_threads;
44-
result.min = min;
45-
result.max = max;
46-
result.samples = samples_sum / num_threads;
47-
result.total_iterations = iterations_sum / num_threads;
48-
result.total_time = time_sum / num_threads;
49-
return result;
43+
44+
void update(const BenchmarkResult &result) {
45+
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
46+
active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
47+
48+
cycles_sum.fetch_add(result.cycles, cpp::MemoryOrder::RELAXED);
49+
standard_deviation_sum.fetch_add(
50+
static_cast<uint64_t>(result.standard_deviation),
51+
cpp::MemoryOrder::RELAXED);
52+
53+
// Perform a CAS loop to atomically update the min
54+
uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
55+
while (!min.compare_exchange_strong(
56+
orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE,
57+
cpp::MemoryOrder::RELAXED))
58+
;
59+
60+
// Perform a CAS loop to atomically update the max
61+
uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED);
62+
while (!max.compare_exchange_strong(
63+
orig_max, cpp::max(orig_max, result.max), cpp::MemoryOrder::ACQUIRE,
64+
cpp::MemoryOrder::RELAXED))
65+
;
66+
67+
samples_sum.fetch_add(result.samples, cpp::MemoryOrder::RELAXED);
68+
iterations_sum.fetch_add(result.total_iterations,
69+
cpp::MemoryOrder::RELAXED);
70+
time_sum.fetch_add(result.total_time, cpp::MemoryOrder::RELAXED);
71+
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
72+
}
73+
};
74+
75+
AtomicBenchmarkSums all_results;
76+
77+
// Builds the report for benchmark `b` from the global `all_results`
// accumulators and writes it to `log`.
//
// Cycles, standard deviation, samples, iterations and time are reported as
// the accumulated sum divided by the number of threads that contributed;
// min and max are reported as stored.
void print_results(Benchmark *b) {
  constexpr auto GREEN = "\033[32m";
  constexpr auto RESET = "\033[0m";
  constexpr auto RELAXED = cpp::MemoryOrder::RELAXED;

  BenchmarkResult result;
  // NOTE(review): a release fence does not order the loads that follow it;
  // visibility of the other threads' updates presumably comes from the
  // gpu::sync_threads() call the caller issues before this function --
  // confirm.
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

  // Keep the counter in its native 64-bit unsigned type rather than
  // narrowing it to int.
  const uint64_t num_threads = all_results.active_threads.load(RELAXED);
  // Never divide by zero: if no thread contributed, the sums are reported
  // undivided while the log line still shows the real thread count.
  const uint64_t divisor = num_threads != 0 ? num_threads : 1;

  result.cycles = all_results.cycles_sum.load(RELAXED) / divisor;
  result.standard_deviation =
      all_results.standard_deviation_sum.load(RELAXED) / divisor;
  result.min = all_results.min.load(RELAXED);
  result.max = all_results.max.load(RELAXED);
  result.samples = all_results.samples_sum.load(RELAXED) / divisor;
  result.total_iterations =
      all_results.iterations_sum.load(RELAXED) / divisor;
  result.total_time = all_results.time_sum.load(RELAXED) / divisor;
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

  log << GREEN << "[ RUN      ] " << RESET << b->get_name() << '\n';
  log << GREEN << "[       OK ] " << RESET << b->get_name() << ": "
      << result.cycles << " cycles, " << result.min << " min, " << result.max
      << " max, " << result.total_iterations << " iterations, "
      << result.total_time << " ns, "
      << static_cast<uint64_t>(result.standard_deviation)
      << " stddev (num threads: " << num_threads << ")\n";
}
51107

52108
void Benchmark::run_benchmarks() {
53109
uint64_t id = gpu::get_thread_id();
54110
gpu::sync_threads();
55111

56112
for (Benchmark *b : benchmarks) {
57-
results[id] = b->run();
113+
if (id == 0)
114+
all_results.reset();
115+
58116
gpu::sync_threads();
59-
if (id == 0) {
60-
BenchmarkResult all_results = reduce_results(results);
61-
constexpr auto GREEN = "\033[32m";
62-
constexpr auto RESET = "\033[0m";
63-
log << GREEN << "[ RUN ] " << RESET << b->get_name() << '\n';
64-
log << GREEN << "[ OK ] " << RESET << b->get_name() << ": "
65-
<< all_results.cycles << " cycles, " << all_results.min << " min, "
66-
<< all_results.max << " max, " << all_results.total_iterations
67-
<< " iterations, " << all_results.total_time << " ns, "
68-
<< static_cast<long>(all_results.standard_deviation) << " stddev\n";
69-
}
117+
auto current_result = b->run();
118+
all_results.update(current_result);
119+
gpu::sync_threads();
120+
121+
if (id == 0)
122+
print_results(b);
70123
}
71124
gpu::sync_threads();
72125
}

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ class Benchmark {
8888
}
8989

9090
static void run_benchmarks();
91+
// Returns this benchmark's name. The view is returned by value, so a
// top-level `const` on the return type would add nothing for callers.
cpp::string_view get_name() const { return name; }
9192

9293
protected:
9394
static void add_benchmark(Benchmark *benchmark);
@@ -97,13 +98,12 @@ class Benchmark {
9798
BenchmarkOptions options;
9899
return benchmark(options, func);
99100
}
100-
const cpp::string_view get_name() const { return name; }
101101
};
102102
} // namespace benchmarks
103103
} // namespace LIBC_NAMESPACE_DECL
104104

105105
#define BENCHMARK(SuiteName, TestName, Func) \
106106
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
107-
Func, #SuiteName "." #TestName);
107+
Func, #SuiteName "." #TestName)
108108

109109
#endif

0 commit comments

Comments
 (0)