Skip to content

Commit 6e574f1

Browse files
committed
Revert "[OpenMP] Provide a specialized team reduction for the common case (#70766)"
This reverts commit eab828d.
1 parent fb07d9c commit 6e574f1

File tree

1 file changed

+4
-102
lines changed

1 file changed

+4
-102
lines changed

openmp/libomptarget/DeviceRTL/src/Reduction.cpp

Lines changed: 4 additions & 102 deletions
Original file line number | Diff line number | Diff line change
@@ -176,109 +176,11 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
176176
return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
177177
}
178178

179-
/// Mostly like _v2 but with the builtin assumption that we have less than
180-
/// num_of_records (by default 1024) teams.
181-
int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
182-
IdentTy *Loc, void *__restrict__ GlobalBuffer, uint32_t num_of_records,
183-
uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
184-
InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
185-
ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
186-
// Terminate all threads in non-SPMD mode except for the main thread.
187-
uint32_t ThreadId = mapping::getThreadIdInBlock();
188-
if (mapping::isGenericMode()) {
189-
if (!mapping::isMainThreadInGenericMode())
190-
return 0;
191-
ThreadId = 0;
192-
}
193-
194-
uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
195-
196-
// In non-generic mode all workers participate in the teams reduction.
197-
// In generic mode only the team main participates in the teams
198-
// reduction because the workers are waiting for parallel work.
199-
uint32_t NumThreads = omp_get_num_threads();
200-
uint32_t TeamId = omp_get_team_num();
201-
uint32_t NumTeams = omp_get_num_teams();
202-
static unsigned SHARED(ChunkTeamCount);
203-
204-
// Block progress for teams greater than the current upper
205-
// limit. We always only allow a number of teams less or equal
206-
// to the number of slots in the buffer.
207-
bool IsMain = (ThreadId == 0);
208-
209-
if (IsMain) {
210-
lgcpyFct(GlobalBuffer, TeamId, reduce_data);
211-
212-
// Propagate the memory writes above to the world.
213-
fence::kernel(atomic::release);
214-
215-
// Increment team counter.
216-
// This counter is incremented by all teams in the current
217-
// BUFFER_SIZE chunk.
218-
ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
219-
atomic::MemScopeTy::device);
220-
}
221-
222-
// Synchronize in SPMD mode as in generic mode all but 1 threads are in the
223-
// state machine.
224-
if (mapping::isSPMDMode())
225-
synchronize::threadsAligned(atomic::acq_rel);
226-
227-
// Each thread will have a local struct containing the values to be
228-
// reduced:
229-
// 1. do reduction within each warp.
230-
// 2. do reduction across warps.
231-
// 3. write the final result to the main reduction variable
232-
// by returning 1 in the thread holding the reduction result.
233-
234-
// Check if this is the very last team.
235-
if (ChunkTeamCount != NumTeams - 1)
236-
return 0;
237-
238-
// Last team processing.
239-
NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
240-
if (ThreadId >= NumThreads)
241-
return 0;
242-
243-
// Ensure we see the global memory writes by other teams
244-
fence::kernel(atomic::aquire);
245-
246-
// Load from buffer and reduce.
247-
glcpyFct(GlobalBuffer, ThreadId, reduce_data);
248-
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
249-
glredFct(GlobalBuffer, i, reduce_data);
250-
251-
// Reduce across warps to the warp main.
252-
gpu_regular_warp_reduce(reduce_data, shflFct);
253-
254-
uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
255-
uint32_t WarpsNeeded =
256-
(ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
257-
// Gather all the reduced values from each warp
258-
// to the first warp.
259-
cpyFct(reduce_data, WarpsNeeded);
260-
261-
if (mapping::getWarpIdInBlock() == 0)
262-
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
263-
264-
return IsMain;
265-
}
266-
267179
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
268-
IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
269-
uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
270-
InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
271-
ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
272-
// The first check is a compile time constant, the second one a runtime check.
273-
// If the first one succeeds we will use the specialized version.
274-
if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
275-
state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
276-
num_of_records == 1024) ||
277-
(omp_get_num_teams() <= num_of_records))
278-
return __kmpc_nvptx_teams_reduce_nowait_v3(
279-
Loc, GlobalBuffer, num_of_records, reduce_data_size, reduce_data,
280-
shflFct, cpyFct, lgcpyFct, lgredFct, glcpyFct, glredFct);
281-
180+
IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
181+
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
182+
ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
183+
ListGlobalFnTy glredFct) {
282184
// Terminate all threads in non-SPMD mode except for the master thread.
283185
uint32_t ThreadId = mapping::getThreadIdInBlock();
284186
if (mapping::isGenericMode()) {

0 commit comments

Comments
 (0)