@@ -176,109 +176,11 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
   return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
-/// Mostly like _v2 but with the builtin assumption that we have less than
-/// num_of_records (by default 1024) teams.
-int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
-    IdentTy *Loc, void *__restrict__ GlobalBuffer, uint32_t num_of_records,
-    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
-  // Terminate all threads in non-SPMD mode except for the main thread.
-  uint32_t ThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isGenericMode()) {
-    if (!mapping::isMainThreadInGenericMode())
-      return 0;
-    ThreadId = 0;
-  }
-
-  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
-  // In non-generic mode all workers participate in the teams reduction.
-  // In generic mode only the team main participates in the teams
-  // reduction because the workers are waiting for parallel work.
-  uint32_t NumThreads = omp_get_num_threads();
-  uint32_t TeamId = omp_get_team_num();
-  uint32_t NumTeams = omp_get_num_teams();
-  static unsigned SHARED(ChunkTeamCount);
-
-  // Block progress for teams greater than the current upper
-  // limit. We only ever allow a number of teams less than or equal
-  // to the number of slots in the buffer.
-  bool IsMain = (ThreadId == 0);
-
-  if (IsMain) {
-    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
-
-    // Propagate the memory writes above to the world.
-    fence::kernel(atomic::release);
-
-    // Increment the team counter.
-    // This counter is incremented by all teams in the current
-    // BUFFER_SIZE chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
-                                 atomic::MemScopeTy::device);
-  }
-
-  // Synchronize in SPMD mode; in generic mode all but one thread are in the
-  // state machine.
-  if (mapping::isSPMDMode())
-    synchronize::threadsAligned(atomic::acq_rel);
-
-  // Each thread will have a local struct containing the values to be
-  // reduced:
-  //   1. do reduction within each warp.
-  //   2. do reduction across warps.
-  //   3. write the final result to the main reduction variable
-  //      by returning 1 in the thread holding the reduction result.
-
-  // Check if this is the very last team.
-  if (ChunkTeamCount != NumTeams - 1)
-    return 0;
-
-  // Last team processing.
-  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
-  if (ThreadId >= NumThreads)
-    return 0;
-
-  // Ensure we see the global memory writes by other teams.
-  fence::kernel(atomic::aquire);
-
-  // Load from the buffer and reduce.
-  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
-  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
-    glredFct(GlobalBuffer, i, reduce_data);
-
-  // Reduce across warps to the warp main.
-  gpu_regular_warp_reduce(reduce_data, shflFct);
-
-  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
-  uint32_t WarpsNeeded =
-      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  // Gather all the reduced values from each warp
-  // to the first warp.
-  cpyFct(reduce_data, WarpsNeeded);
-
-  if (mapping::getWarpIdInBlock() == 0)
-    gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
-
-  return IsMain;
-}
-
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
-    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
-    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
-  // The first check is a compile-time constant, the second one a runtime check.
-  // If the first one succeeds we will use the specialized version.
-  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
-       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
-       num_of_records == 1024) ||
-      (omp_get_num_teams() <= num_of_records))
-    return __kmpc_nvptx_teams_reduce_nowait_v3(
-        Loc, GlobalBuffer, num_of_records, reduce_data_size, reduce_data,
-        shflFct, cpyFct, lgcpyFct, lgredFct, glcpyFct, glredFct);
-
+    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
+    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
+    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
+    ListGlobalFnTy glredFct) {
   // Terminate all threads in non-SPMD mode except for the master thread.
   uint32_t ThreadId = mapping::getThreadIdInBlock();
   if (mapping::isGenericMode()) {
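The removed _v3 entry point is an instance of the classic "last block" reduction pattern on GPUs: every team writes its partial result into a slot of a global buffer, a release fence publishes the write, and an atomic counter elects the team that arrives last to fold all the slots. Below is a minimal standalone CUDA sketch of that pattern, assuming a plain integer sum; all names in it (sumKernel, partialSums, retirementCount) are illustrative and are not part of the libomptarget device runtime.

// last_team_sum.cu -- illustrative pattern only, not libomptarget code.
#include <cstdio>

__device__ unsigned retirementCount = 0;

__global__ void sumKernel(const int *in, int n, int *partialSums, int *out) {
  // Step 1: each block computes a partial sum over a strided slice of the
  // input (one thread per block here, purely for clarity).
  int acc = 0;
  for (int i = blockIdx.x; i < n; i += gridDim.x)
    acc += in[i];
  partialSums[blockIdx.x] = acc;

  // Step 2: publish the write device-wide, then draw a ticket. This mirrors
  // the fence::kernel(atomic::release) + atomic::inc pair on ReductionCnt.
  __threadfence();
  unsigned ticket = atomicInc(&retirementCount, gridDim.x);

  // Step 3: the block holding the last ticket folds every slot, mirroring
  // the "Check if this is the very last team" branch in the removed code.
  if (ticket == gridDim.x - 1) {
    __threadfence(); // acquire side: observe all other blocks' writes
    int total = 0;
    for (unsigned b = 0; b < gridDim.x; ++b)
      total += partialSums[b];
    *out = total;
    retirementCount = 0; // reset so the kernel can be launched again
  }
}

int main() {
  const int n = 1 << 20, blocks = 128;
  int *in, *partial, *out;
  cudaMallocManaged(&in, n * sizeof(int));
  cudaMallocManaged(&partial, blocks * sizeof(int));
  cudaMallocManaged(&out, sizeof(int));
  for (int i = 0; i < n; ++i)
    in[i] = 1;
  sumKernel<<<blocks, 1>>>(in, n, partial, out);
  cudaDeviceSynchronize();
  printf("sum = %d (expected %d)\n", *out, n);
  return 0;
}

As in the removed code, correctness hinges on the fence-before-increment ordering: a team's buffer slot must be globally visible before its ticket can be observed by the last team to arrive.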
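The gpu_regular_warp_reduce(reduce_data, shflFct) step in the removed function works through a compiler-generated shuffle callback. As a concrete picture of what such a warp-level step does, here is a hedged CUDA sketch of a plain warp sum built on __shfl_down_sync; the real callback reduces an opaque reduce_data struct element-wise rather than a single int.

// warp_sum.cu -- illustrative only; shflFct itself is generated by the
// compiler and has a different signature.
#include <cstdio>

// After the loop, lane 0 holds the sum of all 32 lanes: each iteration
// halves the number of live partial sums by pulling a value from the lane
// `offset` positions higher.
__device__ int warpReduceSum(int val) {
  for (int offset = 16; offset > 0; offset >>= 1)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  return val;
}

__global__ void demo(int *out) {
  int val = threadIdx.x + 1; // lanes contribute 1..32
  int sum = warpReduceSum(val);
  if (threadIdx.x == 0)
    *out = sum; // 1 + 2 + ... + 32 = 528
}

int main() {
  int *out;
  cudaMallocManaged(&out, sizeof(int));
  demo<<<1, 32>>>(out);
  cudaDeviceSynchronize();
  printf("warp sum = %d (expected 528)\n", *out);
  return 0;
}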