1
1
#include " LibcGpuBenchmark.h"
2
2
#include " src/__support/CPP/algorithm.h"
3
3
#include " src/__support/CPP/array.h"
4
+ #include " src/__support/CPP/atomic.h"
4
5
#include " src/__support/CPP/string.h"
5
6
#include " src/__support/FPUtil/sqrt.h"
6
7
#include " src/__support/GPU/utils.h"
@@ -12,61 +13,113 @@ namespace LIBC_NAMESPACE_DECL {
12
13
namespace benchmarks {
13
14
14
15
// Benchmarks registered through Benchmark::add_benchmark and later iterated
// by Benchmark::run_benchmarks.
// NOTE(review): the 64 is presumably the fixed capacity of FixedVector --
// confirm against its definition before registering more benchmarks.
FixedVector<Benchmark *, 64 > benchmarks;
15
- cpp::array<BenchmarkResult, 1024 > results;
16
16
17
17
/// Registers `benchmark` by appending it to the file-level `benchmarks` list,
/// which `Benchmark::run_benchmarks` walks in insertion order.
///
/// \param benchmark Pointer stored as-is; no ownership transfer is visible
///                  here.
void Benchmark::add_benchmark(Benchmark *benchmark) {
  benchmarks.push_back(benchmark);
}
20
20
21
- BenchmarkResult
22
- reduce_results (const cpp::array<BenchmarkResult, 1024 > &results) {
23
- BenchmarkResult result;
24
- uint64_t cycles_sum = 0 ;
25
- double standard_deviation_sum = 0 ;
26
- uint64_t min = UINT64_MAX;
27
- uint64_t max = 0 ;
28
- uint32_t samples_sum = 0 ;
29
- uint32_t iterations_sum = 0 ;
30
- clock_t time_sum = 0 ;
31
- uint64_t num_threads = gpu::get_num_threads ();
32
- for (uint64_t i = 0 ; i < num_threads; i++) {
33
- BenchmarkResult current_result = results[i];
34
- cycles_sum += current_result.cycles ;
35
- standard_deviation_sum += current_result.standard_deviation ;
36
- min = cpp::min (min, current_result.min );
37
- max = cpp::max (max, current_result.max );
38
- samples_sum += current_result.samples ;
39
- iterations_sum += current_result.total_iterations ;
40
- time_sum += current_result.total_time ;
21
+ struct AtomicBenchmarkSums {
22
+ cpp::Atomic<uint64_t > cycles_sum = 0 ;
23
+ cpp::Atomic<uint64_t > standard_deviation_sum = 0 ;
24
+ cpp::Atomic<uint64_t > min = UINT64_MAX;
25
+ cpp::Atomic<uint64_t > max = 0 ;
26
+ cpp::Atomic<uint32_t > samples_sum = 0 ;
27
+ cpp::Atomic<uint32_t > iterations_sum = 0 ;
28
+ cpp::Atomic<clock_t > time_sum = 0 ;
29
+ cpp::Atomic<uint64_t > active_threads = 0 ;
30
+
31
+ void reset () {
32
+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
33
+ active_threads.store (0 , cpp::MemoryOrder::RELAXED);
34
+ cycles_sum.store (0 , cpp::MemoryOrder::RELAXED);
35
+ standard_deviation_sum.store (0 , cpp::MemoryOrder::RELAXED);
36
+ min.store (UINT64_MAX, cpp::MemoryOrder::RELAXED);
37
+ max.store (0 , cpp::MemoryOrder::RELAXED);
38
+ samples_sum.store (0 , cpp::MemoryOrder::RELAXED);
39
+ iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
40
+ time_sum.store (0 , cpp::MemoryOrder::RELAXED);
41
+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
41
42
}
42
- result.cycles = cycles_sum / num_threads;
43
- result.standard_deviation = standard_deviation_sum / num_threads;
44
- result.min = min;
45
- result.max = max;
46
- result.samples = samples_sum / num_threads;
47
- result.total_iterations = iterations_sum / num_threads;
48
- result.total_time = time_sum / num_threads;
49
- return result;
43
+
44
+ void update (const BenchmarkResult &result) {
45
+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
46
+ active_threads.fetch_add (1 , cpp::MemoryOrder::RELAXED);
47
+
48
+ cycles_sum.fetch_add (result.cycles , cpp::MemoryOrder::RELAXED);
49
+ standard_deviation_sum.fetch_add (
50
+ static_cast <uint64_t >(result.standard_deviation ),
51
+ cpp::MemoryOrder::RELAXED);
52
+
53
+ // Perform a CAS loop to atomically update the min
54
+ uint64_t orig_min = min.load (cpp::MemoryOrder::RELAXED);
55
+ while (!min.compare_exchange_strong (
56
+ orig_min, cpp::min (orig_min, result.min ), cpp::MemoryOrder::ACQUIRE,
57
+ cpp::MemoryOrder::RELAXED))
58
+ ;
59
+
60
+ // Perform a CAS loop to atomically update the max
61
+ uint64_t orig_max = max.load (cpp::MemoryOrder::RELAXED);
62
+ while (!max.compare_exchange_strong (
63
+ orig_max, cpp::max (orig_max, result.max ), cpp::MemoryOrder::ACQUIRE,
64
+ cpp::MemoryOrder::RELAXED))
65
+ ;
66
+
67
+ samples_sum.fetch_add (result.samples , cpp::MemoryOrder::RELAXED);
68
+ iterations_sum.fetch_add (result.total_iterations ,
69
+ cpp::MemoryOrder::RELAXED);
70
+ time_sum.fetch_add (result.total_time , cpp::MemoryOrder::RELAXED);
71
+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
72
+ }
73
+ };
74
+
75
+ AtomicBenchmarkSums all_results;
76
+
77
+ void print_results (Benchmark *b) {
78
+ constexpr auto GREEN = " \033 [32m" ;
79
+ constexpr auto RESET = " \033 [0m" ;
80
+
81
+ BenchmarkResult result;
82
+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
83
+ int num_threads = all_results.active_threads .load (cpp::MemoryOrder::RELAXED);
84
+ result.cycles =
85
+ all_results.cycles_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
86
+ result.standard_deviation =
87
+ all_results.standard_deviation_sum .load (cpp::MemoryOrder::RELAXED) /
88
+ num_threads;
89
+ result.min = all_results.min .load (cpp::MemoryOrder::RELAXED);
90
+ result.max = all_results.max .load (cpp::MemoryOrder::RELAXED);
91
+ result.samples =
92
+ all_results.samples_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
93
+ result.total_iterations =
94
+ all_results.iterations_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
95
+ result.total_time =
96
+ all_results.time_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
97
+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
98
+
99
+ log << GREEN << " [ RUN ] " << RESET << b->get_name () << ' \n ' ;
100
+ log << GREEN << " [ OK ] " << RESET << b->get_name () << " : "
101
+ << result.cycles << " cycles, " << result.min << " min, " << result.max
102
+ << " max, " << result.total_iterations << " iterations, "
103
+ << result.total_time << " ns, "
104
+ << static_cast <uint64_t >(result.standard_deviation )
105
+ << " stddev (num threads: " << num_threads << " )\n " ;
50
106
}
51
107
52
108
void Benchmark::run_benchmarks () {
53
109
uint64_t id = gpu::get_thread_id ();
54
110
gpu::sync_threads ();
55
111
56
112
for (Benchmark *b : benchmarks) {
57
- results[id] = b->run ();
113
+ if (id == 0 )
114
+ all_results.reset ();
115
+
58
116
gpu::sync_threads ();
59
- if (id == 0 ) {
60
- BenchmarkResult all_results = reduce_results (results);
61
- constexpr auto GREEN = " \033 [32m" ;
62
- constexpr auto RESET = " \033 [0m" ;
63
- log << GREEN << " [ RUN ] " << RESET << b->get_name () << ' \n ' ;
64
- log << GREEN << " [ OK ] " << RESET << b->get_name () << " : "
65
- << all_results.cycles << " cycles, " << all_results.min << " min, "
66
- << all_results.max << " max, " << all_results.total_iterations
67
- << " iterations, " << all_results.total_time << " ns, "
68
- << static_cast <long >(all_results.standard_deviation ) << " stddev\n " ;
69
- }
117
+ auto current_result = b->run ();
118
+ all_results.update (current_result);
119
+ gpu::sync_threads ();
120
+
121
+ if (id == 0 )
122
+ print_results (b);
70
123
}
71
124
gpu::sync_threads ();
72
125
}
0 commit comments