Skip to content

Commit eab828d

Browse files
authored
[OpenMP] Provide a specialized team reduction for the common case (#70766)
We default to < 1024 teams if the user did not specify otherwise. As such we can avoid the extra logic in the teams reduction that handles more than num_of_records (default 1024) teams. This is a stopgap but still shaves off 33% of the runtime in some simple reduction examples.
1 parent 66152f4 commit eab828d

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed

openmp/libomptarget/DeviceRTL/src/Reduction.cpp

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,109 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
178178
false);
179179
}
180180

181+
/// Mostly like _v2 but with the builtin assumption that we have no more than
/// num_of_records (by default 1024) teams, i.e. every team owns a private
/// slot in \p GlobalBuffer and no chunking/round-tripping over the buffer is
/// needed. (The _v2 dispatch guards this with
/// `omp_get_num_teams() <= num_of_records`.)
///
/// \param Loc            Source location (unused in this specialization).
/// \param TId            Thread id handed in by the caller (unused here; the
///                       in-block id is re-derived from mapping::).
/// \param GlobalBuffer   Per-team reduction scratch buffer; slot i belongs to
///                       team i.
/// \param num_of_records Number of slots in \p GlobalBuffer (unused in the
///                       body — the <= num_of_records precondition is checked
///                       by the caller).
/// \param reduce_data    Thread-local reduction payload.
/// \param shflFct        Warp-shuffle reduction callback.
/// \param cpyFct         Inter-warp copy callback (via shared memory).
/// \param lgcpyFct/lgredFct  List-to-global copy/reduce callbacks.
/// \param glcpyFct/glredFct  Global-to-list copy/reduce callbacks.
///
/// \return 1 only in the single thread (main thread of the last team to
///         arrive) that ends up holding the final reduced value; 0 in all
///         other threads/teams.
int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
    IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
    uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the main thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  // Device-wide arrival counter shared by all teams of this kernel launch.
  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team main participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  // Team-shared: written by thread 0, read by every thread after the
  // aligned barrier below.
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We always only allow a number of teams less or equal
  // to the number of slots in the buffer.
  bool IsMain = (ThreadId == 0);

  if (IsMain) {
    // Publish this team's partial result into its private slot (slot ==
    // TeamId; valid because NumTeams <= num_of_records by precondition).
    lgcpyFct(GlobalBuffer, TeamId, reduce_data);

    // Propagate the memory writes above to the world.
    fence::kernel(atomic::release);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
    // Returns the pre-increment value, so the last team to arrive
    // observes NumTeams - 1.
    ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
                                 atomic::MemScopeTy::device);
  }

  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
  // state machine.
  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);

  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  if (ChunkTeamCount != NumTeams - 1)
    return 0;

  // Last team processing.
  // Cap the worker count at NumTeams (no point in more readers than slots)
  // and round up to a full warp so the shuffle reduction below is well-formed.
  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
  if (ThreadId >= NumThreads)
    return 0;

  // Ensure we see the global memory writes by other teams
  // NOTE(review): "aquire" is the (misspelled) ordering-enum name used
  // project-wide in DeviceRTL — do not "fix" the spelling locally.
  fence::kernel(atomic::aquire);

  // Load from buffer and reduce.
  // Thread t seeds from slot t, then strides over the remaining slots.
  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
    glredFct(GlobalBuffer, i, reduce_data);

  // Reduce across warps to the warp main.
  gpu_regular_warp_reduce(reduce_data, shflFct);

  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
  uint32_t WarpsNeeded =
      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  // Gather all the reduced values from each warp
  // to the first warp.
  cpyFct(reduce_data, WarpsNeeded);

  if (mapping::getWarpIdInBlock() == 0)
    gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);

  // Only thread 0 of the last team reports the final value.
  return IsMain;
}
268+
181269
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
182270
IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
183271
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
184272
ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
185273
ListGlobalFnTy glredFct) {
274+
// The first check is a compile time constant, the second one a runtime check.
275+
// If the first one succeeds we will use the specialized version.
276+
if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
277+
state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
278+
num_of_records == 1024) ||
279+
(omp_get_num_teams() <= num_of_records))
280+
return __kmpc_nvptx_teams_reduce_nowait_v3(
281+
Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
282+
lgcpyFct, lgredFct, glcpyFct, glredFct);
283+
186284
// Terminate all threads in non-SPMD mode except for the master thread.
187285
uint32_t ThreadId = mapping::getThreadIdInBlock();
188286
if (mapping::isGenericMode()) {

0 commit comments

Comments
 (0)