Skip to content

Commit a2fc0d3

Browse files
committed
[OpenMP] Move synchronization into __tgt_async_info
The AsyncInfo should be passed everywhere and it should offer a way to ensure synchronization, given a libomptarget Device. This replaces D96431. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D96438
1 parent 9427287 commit a2fc0d3

File tree

3 files changed

+67
-43
lines changed

3 files changed

+67
-43
lines changed

openmp/libomptarget/include/omptarget.h

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ struct __tgt_target_table {
119119
*EntriesEnd; // End of the table with all the entries (non inclusive)
120120
};
121121

122+
// clang-format on
123+
122124
/// This struct contains information exchanged between different asynchronous
123125
/// operations for device-dependent optimization and potential synchronization
124126
struct __tgt_async_info {
@@ -128,15 +130,36 @@ struct __tgt_async_info {
128130
void *Queue = nullptr;
129131
};
130132

133+
struct DeviceTy;
134+
135+
/// The libomptarget wrapper around a __tgt_async_info object directly
136+
/// associated with a libomptarget layer device. RAII semantics to avoid
137+
/// mistakes.
138+
class AsyncInfoTy {
139+
__tgt_async_info AsyncInfo;
140+
DeviceTy &Device;
141+
142+
public:
143+
AsyncInfoTy(DeviceTy &Device) : Device(Device) {}
144+
~AsyncInfoTy() { synchronize(); }
145+
146+
/// Implicit conversion to the __tgt_async_info which is used in the
147+
/// plugin interface.
148+
operator __tgt_async_info *() { return &AsyncInfo; }
149+
150+
/// Synchronize all pending actions.
151+
///
152+
/// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
153+
int synchronize();
154+
};
155+
131156
/// This struct is a record of non-contiguous information
132157
struct __tgt_target_non_contig {
133158
uint64_t Offset;
134159
uint64_t Count;
135160
uint64_t Stride;
136161
};
137162

138-
// clang-format on
139-
140163
#ifdef __cplusplus
141164
extern "C" {
142165
#endif

openmp/libomptarget/src/omptarget.cpp

Lines changed: 37 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,18 @@
1919
#include <cassert>
2020
#include <vector>
2121

22+
int AsyncInfoTy::synchronize() {
23+
int Result = OFFLOAD_SUCCESS;
24+
if (AsyncInfo.Queue) {
25+
// If we have a queue we need to synchronize it now.
26+
Result = Device.synchronize(&AsyncInfo);
27+
assert(AsyncInfo.Queue == nullptr &&
28+
"The device plugin should have nulled the queue to indicate there "
29+
"are no outstanding actions!");
30+
}
31+
return Result;
32+
}
33+
2234
/* All begin addresses for partially mapped structs must be 8-aligned in order
2335
* to ensure proper alignment of members. E.g.
2436
*
@@ -248,7 +260,7 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
248260
MapperArgsBase.data(), MapperArgs.data(),
249261
MapperArgSizes.data(), MapperArgTypes.data(),
250262
MapperArgNames.data(), /*arg_mappers*/ nullptr,
251-
/*__tgt_async_info*/ nullptr);
263+
/* AsyncInfoTy */ nullptr);
252264

253265
return rc;
254266
}
@@ -257,7 +269,7 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
257269
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
258270
void **args_base, void **args, int64_t *arg_sizes,
259271
int64_t *arg_types, map_var_info_t *arg_names,
260-
void **arg_mappers, __tgt_async_info *AsyncInfo) {
272+
void **arg_mappers, AsyncInfoTy *AsyncInfo) {
261273
// process each input.
262274
for (int32_t i = 0; i < arg_num; ++i) {
263275
// Ignore private variables and arrays - there is no mapping for them.
@@ -404,7 +416,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
404416
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
405417
data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
406418
int rt =
407-
Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo);
419+
Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, *AsyncInfo);
408420
if (rt != OFFLOAD_SUCCESS) {
409421
REPORT("Copying data to device failed.\n");
410422
return OFFLOAD_FAIL;
@@ -418,7 +430,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
418430
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
419431
void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
420432
int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
421-
sizeof(void *), AsyncInfo);
433+
sizeof(void *), *AsyncInfo);
422434
if (rt != OFFLOAD_SUCCESS) {
423435
REPORT("Copying data to device failed.\n");
424436
return OFFLOAD_FAIL;
@@ -452,24 +464,13 @@ struct DeallocTgtPtrInfo {
452464
: HstPtrBegin(HstPtr), DataSize(Size), ForceDelete(ForceDelete),
453465
HasCloseModifier(HasCloseModifier) {}
454466
};
455-
456-
/// Synchronize device
457-
static int syncDevice(DeviceTy &Device, __tgt_async_info *AsyncInfo) {
458-
assert(AsyncInfo && AsyncInfo->Queue && "Invalid AsyncInfo");
459-
if (Device.synchronize(AsyncInfo) != OFFLOAD_SUCCESS) {
460-
REPORT("Failed to synchronize device.\n");
461-
return OFFLOAD_FAIL;
462-
}
463-
464-
return OFFLOAD_SUCCESS;
465-
}
466467
} // namespace
467468

468469
/// Internal function to undo the mapping and retrieve the data from the device.
469470
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
470471
void **ArgBases, void **Args, int64_t *ArgSizes,
471472
int64_t *ArgTypes, map_var_info_t *ArgNames,
472-
void **ArgMappers, __tgt_async_info *AsyncInfo) {
473+
void **ArgMappers, AsyncInfoTy *AsyncInfo) {
473474
int Ret;
474475
std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
475476
// process each input.
@@ -584,7 +585,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
584585
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
585586
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
586587
Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
587-
AsyncInfo);
588+
*AsyncInfo);
588589
if (Ret != OFFLOAD_SUCCESS) {
589590
REPORT("Copying data from device failed.\n");
590591
return OFFLOAD_FAIL;
@@ -642,8 +643,8 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
642643
// nullptr, no data transfer has happened, because once there is one,
643644
// AsyncInfo->Queue will not be nullptr, so again, we don't need to
644645
// synchronize.
645-
if (AsyncInfo && AsyncInfo->Queue) {
646-
Ret = syncDevice(Device, AsyncInfo);
646+
if (AsyncInfo) {
647+
Ret = AsyncInfo->synchronize();
647648
if (Ret != OFFLOAD_SUCCESS)
648649
return OFFLOAD_FAIL;
649650
}
@@ -798,7 +799,7 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
798799
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
799800
void **ArgsBase, void **Args, int64_t *ArgSizes,
800801
int64_t *ArgTypes, map_var_info_t *ArgNames,
801-
void **ArgMappers, __tgt_async_info *AsyncInfo) {
802+
void **ArgMappers, AsyncInfoTy *AsyncInfo) {
802803
// process each input.
803804
for (int32_t I = 0; I < ArgNum; ++I) {
804805
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
@@ -948,8 +949,8 @@ class PrivateArgumentManagerTy {
948949

949950
/// A reference to the \p DeviceTy object
950951
DeviceTy &Device;
951-
/// A pointer to a \p __tgt_async_info object
952-
__tgt_async_info *AsyncInfo;
952+
/// A pointer to a \p AsyncInfoTy object
953+
AsyncInfoTy *AsyncInfo;
953954

954955
// TODO: What would be the best value here? Should we make it configurable?
955956
// If the size is larger than this threshold, we will allocate and transfer it
@@ -958,7 +959,7 @@ class PrivateArgumentManagerTy {
958959

959960
public:
960961
/// Constructor
961-
PrivateArgumentManagerTy(DeviceTy &Dev, __tgt_async_info *AsyncInfo)
962+
PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy *AsyncInfo)
962963
: Device(Dev), AsyncInfo(AsyncInfo) {}
963964

964965
/// Add a private argument
@@ -985,7 +986,7 @@ class PrivateArgumentManagerTy {
985986
#endif
986987
// If first-private, copy data from host
987988
if (IsFirstPrivate) {
988-
int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
989+
int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, *AsyncInfo);
989990
if (Ret != OFFLOAD_SUCCESS) {
990991
DP("Copying data to device failed, failed.\n");
991992
return OFFLOAD_FAIL;
@@ -1041,7 +1042,7 @@ class PrivateArgumentManagerTy {
10411042
FirstPrivateArgSize, DPxPTR(TgtPtr));
10421043
// Transfer data to target device
10431044
int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1044-
FirstPrivateArgSize, AsyncInfo);
1045+
FirstPrivateArgSize, *AsyncInfo);
10451046
if (Ret != OFFLOAD_SUCCESS) {
10461047
DP("Failed to submit data of private arguments.\n");
10471048
return OFFLOAD_FAIL;
@@ -1089,7 +1090,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
10891090
std::vector<void *> &TgtArgs,
10901091
std::vector<ptrdiff_t> &TgtOffsets,
10911092
PrivateArgumentManagerTy &PrivateArgumentManager,
1092-
__tgt_async_info *AsyncInfo) {
1093+
AsyncInfoTy *AsyncInfo) {
10931094
TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
10941095
DeviceTy &Device = PM->Devices[DeviceId];
10951096
int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
@@ -1140,7 +1141,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
11401141
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
11411142
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
11421143
Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1143-
sizeof(void *), AsyncInfo);
1144+
sizeof(void *), *AsyncInfo);
11441145
if (Ret != OFFLOAD_SUCCESS) {
11451146
REPORT("Copying data to device failed.\n");
11461147
return OFFLOAD_FAIL;
@@ -1210,7 +1211,7 @@ static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
12101211
int64_t *ArgSizes, int64_t *ArgTypes,
12111212
map_var_info_t *ArgNames, void **ArgMappers,
12121213
PrivateArgumentManagerTy &PrivateArgumentManager,
1213-
__tgt_async_info *AsyncInfo) {
1214+
AsyncInfoTy *AsyncInfo) {
12141215
TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
12151216
DeviceTy &Device = PM->Devices[DeviceId];
12161217

@@ -1242,8 +1243,7 @@ static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
12421243
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
12431244
void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
12441245
map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1245-
int32_t ThreadLimit, int IsTeamConstruct,
1246-
__tgt_async_info *AsyncInfo) {
1246+
int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy *AsyncInfo) {
12471247
int32_t DeviceId = Device.DeviceID;
12481248

12491249
TableMap *TM = getTableMap(HostPtr);
@@ -1266,7 +1266,7 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
12661266

12671267
// TODO: This will go away as soon as we consistently pass in async info
12681268
// objects (as references).
1269-
__tgt_async_info InternalAsyncInfo;
1269+
AsyncInfoTy InternalAsyncInfo(Device);
12701270
if (!AsyncInfo)
12711271
AsyncInfo = &InternalAsyncInfo;
12721272

@@ -1301,10 +1301,10 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
13011301
if (IsTeamConstruct)
13021302
Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
13031303
TgtArgs.size(), TeamNum, ThreadLimit,
1304-
LoopTripCount, AsyncInfo);
1304+
LoopTripCount, *AsyncInfo);
13051305
else
13061306
Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1307-
TgtArgs.size(), AsyncInfo);
1307+
TgtArgs.size(), *AsyncInfo);
13081308
}
13091309

13101310
if (Ret != OFFLOAD_SUCCESS) {
@@ -1322,11 +1322,13 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
13221322
REPORT("Failed to process data after launching the kernel.\n");
13231323
return OFFLOAD_FAIL;
13241324
}
1325-
} else if (AsyncInfo->Queue) {
1325+
} else {
1326+
// TODO: We should not synchronize here but on the outer level once we pass
1327+
// in a reference AsyncInfo object.
13261328
// If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't
13271329
// have any arguments, and the device supports async operations, so we need a
13281330
// sync at this point.
1329-
return syncDevice(Device, AsyncInfo);
1331+
return AsyncInfo->synchronize();
13301332
}
13311333

13321334
return OFFLOAD_SUCCESS;

openmp/libomptarget/src/private.h

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,23 @@
2323
extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
2424
void **args_base, void **args, int64_t *arg_sizes,
2525
int64_t *arg_types, map_var_info_t *arg_names,
26-
void **arg_mappers, __tgt_async_info *AsyncInfo);
26+
void **arg_mappers, AsyncInfoTy *AsyncInfo);
2727

2828
extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
2929
void **ArgBases, void **Args, int64_t *ArgSizes,
3030
int64_t *ArgTypes, map_var_info_t *arg_names,
31-
void **ArgMappers, __tgt_async_info *AsyncInfo);
31+
void **ArgMappers, AsyncInfoTy *AsyncInfo);
3232

3333
extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num,
3434
void **args_base, void **args, int64_t *arg_sizes,
3535
int64_t *arg_types, map_var_info_t *arg_names,
36-
void **arg_mappers, __tgt_async_info *AsyncInfo);
36+
void **arg_mappers, AsyncInfoTy *AsyncInfo);
3737

3838
extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
3939
void **ArgBases, void **Args, int64_t *ArgSizes,
4040
int64_t *ArgTypes, map_var_info_t *arg_names,
4141
void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit,
42-
int IsTeamConstruct, __tgt_async_info *AsyncInfo);
42+
int IsTeamConstruct, AsyncInfoTy *AsyncInfo);
4343

4444
extern int CheckDeviceAndCtors(int64_t device_id);
4545

@@ -76,8 +76,7 @@ typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t,
7676
// targetDataEnd and targetDataUpdate).
7777
typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
7878
void **, int64_t *, int64_t *,
79-
map_var_info_t *, void **,
80-
__tgt_async_info *);
79+
map_var_info_t *, void **, AsyncInfoTy *);
8180

8281
// Implemented in libomp, they are called from within __tgt_* functions.
8382
#ifdef __cplusplus

0 commit comments

Comments
 (0)