Skip to content

Commit 758b849

Browse files
committed
[OpenMP] Unify omptarget API and usage wrt. __tgt_async_info
This patch unifies our libomptarget API in two ways: (1) always pass a `__tgt_async_info` object; the Queue member decides whether it is in use or not. (2) (Almost) always synchronize in the interface layer and not in the omptarget layer. A side effect is that we now put all constructor and static initializer kernels in a stream too, if the device utilizes `__tgt_async_info`. The patch contains a TODO which can be addressed as we add support for asynchronous malloc and free in the plugin API. This is the only `synchronizeAsyncInfo` left in the omptarget layer. Side note: On a V100 system the GridMini performance for small sizes more than doubled. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D96379
1 parent a2fc0d3 commit 758b849

File tree

7 files changed

+121
-106
lines changed

7 files changed

+121
-106
lines changed

openmp/libomptarget/src/api.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "device.h"
14+
#include "omptarget.h"
1415
#include "private.h"
1516
#include "rtl.h"
1617

@@ -171,27 +172,35 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
171172
} else if (src_device == omp_get_initial_device()) {
172173
DP("copy from host to device\n");
173174
DeviceTy &DstDev = PM->Devices[dst_device];
174-
rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr);
175+
AsyncInfoTy AsyncInfo(DstDev);
176+
rc = DstDev.submitData(dstAddr, srcAddr, length, AsyncInfo);
175177
} else if (dst_device == omp_get_initial_device()) {
176178
DP("copy from device to host\n");
177179
DeviceTy &SrcDev = PM->Devices[src_device];
178-
rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr);
180+
AsyncInfoTy AsyncInfo(SrcDev);
181+
rc = SrcDev.retrieveData(dstAddr, srcAddr, length, AsyncInfo);
179182
} else {
180183
DP("copy from device to device\n");
181184
DeviceTy &SrcDev = PM->Devices[src_device];
182185
DeviceTy &DstDev = PM->Devices[dst_device];
183186
// First try to use D2D memcpy which is more efficient. If fails, fall back
184187
// to the inefficient way.
185188
if (SrcDev.isDataExchangable(DstDev)) {
186-
rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr);
189+
AsyncInfoTy AsyncInfo(SrcDev);
190+
rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, AsyncInfo);
187191
if (rc == OFFLOAD_SUCCESS)
188192
return OFFLOAD_SUCCESS;
189193
}
190194

191195
void *buffer = malloc(length);
192-
rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr);
193-
if (rc == OFFLOAD_SUCCESS)
194-
rc = DstDev.submitData(dstAddr, buffer, length, nullptr);
196+
{
197+
AsyncInfoTy AsyncInfo(SrcDev);
198+
rc = SrcDev.retrieveData(buffer, srcAddr, length, AsyncInfo);
199+
}
200+
if (rc == OFFLOAD_SUCCESS) {
201+
AsyncInfoTy AsyncInfo(SrcDev);
202+
rc = DstDev.submitData(dstAddr, buffer, length, AsyncInfo);
203+
}
195204
free(buffer);
196205
}
197206

openmp/libomptarget/src/device.cpp

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -415,27 +415,27 @@ int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
415415

416416
// Submit data to device
417417
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
418-
__tgt_async_info *AsyncInfoPtr) {
419-
if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize)
418+
AsyncInfoTy &AsyncInfo) {
419+
if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
420420
return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
421421
else
422422
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
423-
AsyncInfoPtr);
423+
AsyncInfo);
424424
}
425425

426426
// Retrieve data from device
427427
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
428-
int64_t Size, __tgt_async_info *AsyncInfoPtr) {
429-
if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize)
428+
int64_t Size, AsyncInfoTy &AsyncInfo) {
429+
if (!RTL->data_retrieve_async || !RTL->synchronize)
430430
return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
431431
else
432432
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
433-
AsyncInfoPtr);
433+
AsyncInfo);
434434
}
435435

436436
// Copy data from current device to destination device directly
437437
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
438-
int64_t Size, __tgt_async_info *AsyncInfo) {
438+
int64_t Size, AsyncInfoTy &AsyncInfo) {
439439
if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
440440
assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
441441
return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
@@ -448,29 +448,29 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
448448
// Run region on device
449449
int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
450450
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
451-
__tgt_async_info *AsyncInfoPtr) {
452-
if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize)
451+
AsyncInfoTy &AsyncInfo) {
452+
if (!RTL->run_region || !RTL->synchronize)
453453
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
454454
TgtVarsSize);
455455
else
456456
return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
457-
TgtOffsets, TgtVarsSize, AsyncInfoPtr);
457+
TgtOffsets, TgtVarsSize, AsyncInfo);
458458
}
459459

460460
// Run team region on device.
461461
int32_t DeviceTy::runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
462462
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
463463
int32_t NumTeams, int32_t ThreadLimit,
464464
uint64_t LoopTripCount,
465-
__tgt_async_info *AsyncInfoPtr) {
466-
if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize)
465+
AsyncInfoTy &AsyncInfo) {
466+
if (!RTL->run_team_region_async || !RTL->synchronize)
467467
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
468468
TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
469469
LoopTripCount);
470470
else
471471
return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
472472
TgtOffsets, TgtVarsSize, NumTeams,
473-
ThreadLimit, LoopTripCount, AsyncInfoPtr);
473+
ThreadLimit, LoopTripCount, AsyncInfo);
474474
}
475475

476476
// Whether data can be copied to DstDevice directly
@@ -485,9 +485,9 @@ bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
485485
return false;
486486
}
487487

488-
int32_t DeviceTy::synchronize(__tgt_async_info *AsyncInfoPtr) {
488+
int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
489489
if (RTL->synchronize)
490-
return RTL->synchronize(RTLDeviceID, AsyncInfoPtr);
490+
return RTL->synchronize(RTLDeviceID, AsyncInfo);
491491
return OFFLOAD_SUCCESS;
492492
}
493493

openmp/libomptarget/src/device.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222
#include <set>
2323
#include <vector>
2424

25+
#include "omptarget.h"
2526
#include "rtl.h"
2627

2728
// Forward declarations.
2829
struct RTLInfoTy;
2930
struct __tgt_bin_desc;
3031
struct __tgt_target_table;
31-
struct __tgt_async_info;
3232

3333
using map_var_info_t = void *;
3434

@@ -200,24 +200,24 @@ struct DeviceTy {
200200
// synchronous.
201201
// Copy data from host to device
202202
int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
203-
__tgt_async_info *AsyncInfoPtr);
203+
AsyncInfoTy &AsyncInfo);
204204
// Copy data from device back to host
205205
int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
206-
__tgt_async_info *AsyncInfoPtr);
206+
AsyncInfoTy &AsyncInfo);
207207
// Copy data from current device to destination device directly
208208
int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
209-
int64_t Size, __tgt_async_info *AsyncInfo);
209+
int64_t Size, AsyncInfoTy &AsyncInfo);
210210

211211
int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets,
212-
int32_t TgtVarsSize, __tgt_async_info *AsyncInfoPtr);
212+
int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo);
213213
int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
214214
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
215215
int32_t NumTeams, int32_t ThreadLimit,
216-
uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr);
216+
uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
217217

218218
/// Synchronize device/queue/event based on \p AsyncInfo and return
219219
/// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
220-
int32_t synchronize(__tgt_async_info *AsyncInfoPtr);
220+
int32_t synchronize(AsyncInfoTy &AsyncInfo);
221221

222222
private:
223223
// Call to RTL

openmp/libomptarget/src/interface.cpp

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
//===----------------------------------------------------------------------===//
1313

1414
#include "device.h"
15+
#include "omptarget.h"
1516
#include "private.h"
1617
#include "rtl.h"
1718

@@ -183,8 +184,11 @@ EXTERN void __tgt_target_data_begin_mapper(ident_t *loc, int64_t device_id,
183184
}
184185
#endif
185186

187+
AsyncInfoTy AsyncInfo(Device);
186188
int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes,
187-
arg_types, arg_names, arg_mappers, nullptr);
189+
arg_types, arg_names, arg_mappers, AsyncInfo);
190+
if (rc == OFFLOAD_SUCCESS)
191+
rc = AsyncInfo.synchronize();
188192
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
189193
}
190194

@@ -270,8 +274,11 @@ EXTERN void __tgt_target_data_end_mapper(ident_t *loc, int64_t device_id,
270274
}
271275
#endif
272276

277+
AsyncInfoTy AsyncInfo(Device);
273278
int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes,
274-
arg_types, arg_names, arg_mappers, nullptr);
279+
arg_types, arg_names, arg_mappers, AsyncInfo);
280+
if (rc == OFFLOAD_SUCCESS)
281+
rc = AsyncInfo.synchronize();
275282
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
276283
}
277284

@@ -335,8 +342,11 @@ EXTERN void __tgt_target_data_update_mapper(ident_t *loc, int64_t device_id,
335342
arg_names, "Updating OpenMP data");
336343

337344
DeviceTy &Device = PM->Devices[device_id];
345+
AsyncInfoTy AsyncInfo(Device);
338346
int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes,
339-
arg_types, arg_names, arg_mappers, nullptr);
347+
arg_types, arg_names, arg_mappers, AsyncInfo);
348+
if (rc == OFFLOAD_SUCCESS)
349+
rc = AsyncInfo.synchronize();
340350
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
341351
}
342352

@@ -408,9 +418,12 @@ EXTERN int __tgt_target_mapper(ident_t *loc, int64_t device_id, void *host_ptr,
408418
#endif
409419

410420
DeviceTy &Device = PM->Devices[device_id];
411-
int rc =
412-
target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
413-
arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, nullptr);
421+
AsyncInfoTy AsyncInfo(Device);
422+
int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
423+
arg_types, arg_names, arg_mappers, 0, 0, false /*team*/,
424+
AsyncInfo);
425+
if (rc == OFFLOAD_SUCCESS)
426+
rc = AsyncInfo.synchronize();
414427
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
415428
return rc;
416429
}
@@ -490,9 +503,12 @@ EXTERN int __tgt_target_teams_mapper(ident_t *loc, int64_t device_id,
490503
#endif
491504

492505
DeviceTy &Device = PM->Devices[device_id];
506+
AsyncInfoTy AsyncInfo(Device);
493507
int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
494508
arg_types, arg_names, arg_mappers, team_num, thread_limit,
495-
true /*team*/, nullptr);
509+
true /*team*/, AsyncInfo);
510+
if (rc == OFFLOAD_SUCCESS)
511+
rc = AsyncInfo.synchronize();
496512
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
497513
return rc;
498514
}

0 commit comments

Comments
 (0)