-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[NVPTX] Add syncscope support for cmpxchg #140812
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[NVPTX] Add syncscope support for cmpxchg #140812
Conversation
@llvm/pr-subscribers-backend-arm Author: Akshay Deodhar (akshayrdeodhar) ChangesThis MR adds support for cmpxchg instructions with syncscope. Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend. Patch is 2.76 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140812.diff 19 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+ // If CAS ordering is monotonic, then the operation will
+ // take default scope. Otherwise, it will retain its scope
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+ def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
+ string op_str, SDPatternOperator op, list<Predicate> preds> {
+ defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+ defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+ defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+ defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
// atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+ // Instantiate scoped versions of the atomic compare and swap pattern
+ defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+ // Syncscope is only supported for SM70+
+ defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ }
+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+ sem_str = "",
+ scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RIS...
[truncated]
|
@llvm/pr-subscribers-backend-risc-v Author: Akshay Deodhar (akshayrdeodhar) ChangesThis MR adds support for cmpxchg instructions with syncscope. Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend. Patch is 2.76 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140812.diff 19 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+ // If CAS ordering is monotonic, then the operation will
+ // take default scope. Otherwise, it will retain its scope
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+ def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
+ string op_str, SDPatternOperator op, list<Predicate> preds> {
+ defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+ defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+ defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+ defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
// atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+ // Instantiate scoped versions of the atomic compare and swap pattern
+ defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+ // Syncscope is only supported for SM70+
+ defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ }
+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+ sem_str = "",
+ scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RIS...
[truncated]
|
@llvm/pr-subscribers-backend-nvptx Author: Akshay Deodhar (akshayrdeodhar) ChangesThis MR adds support for cmpxchg instructions with syncscope. Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend. Patch is 2.76 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140812.diff 19 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+ // If CAS ordering is monotonic, then the operation will
+ // take default scope. Otherwise, it will retain its scope
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+ def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
+ string op_str, SDPatternOperator op, list<Predicate> preds> {
+ defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+ defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+ defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+ defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
// atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+ // Instantiate scoped versions of the atomic compare and swap pattern
+ defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+ // Syncscope is only supported for SM70+
+ defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ }
+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+ sem_str = "",
+ scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RIS...
[truncated]
|
@llvm/pr-subscribers-backend-powerpc Author: Akshay Deodhar (akshayrdeodhar) ChangesThis MR adds support for cmpxchg instructions with syncscope. Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend. Patch is 2.76 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140812.diff 19 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..b2a75965e6c2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index c376de877ac7d..b8dcafa32052b 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+ // If CAS ordering is monotonic, then the operation will
+ // take default scope. Otherwise, it will retain its scope
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..d0268545042ed 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index afbf1b4c55e70..5196ce846d6a2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::ID SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1f417dbada8e6..0bf3e5dcdbf4e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6311,7 +6311,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6319,15 +6320,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index b4b7dad984b62..3f494c9066140 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 193418ca391e5..4dbcf6183efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -41,6 +41,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+ def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
// has 3 operands
-multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
- SDPatternOperator op, list<Predicate> preds> {
- defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
+ string op_str, SDPatternOperator op, list<Predicate> preds> {
+ defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
@@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
-multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
+multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
- defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
+ defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
+ defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
+ defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
// atom_add
@@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);
+
+ // Instantiate scoped versions of the atomic compare and swap pattern
+ defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);
+
+ // Syncscope is only supported for SM70+
+ defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ }
+
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
- : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
+ : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
@@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
- sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
+ sem_str = "",
+ scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..6dd67c76b7077 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..964f5e11f78cd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,10 +927,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RIS...
[truncated]
|
CC: @gonzalobg |
You can test this locally with the following command:git-clang-format --diff HEAD~1 HEAD --extensions cpp -- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp View the diff from clang-format here.diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2da69b6bc..304c93e3e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6182,11 +6182,10 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
if (isReleaseOrStronger(Ord))
- return Builder.CreateFence(
- Ord == AtomicOrdering::SequentiallyConsistent
- ? AtomicOrdering::SequentiallyConsistent
- : AtomicOrdering::Release,
- SSID);
+ return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
+ ? AtomicOrdering::SequentiallyConsistent
+ : AtomicOrdering::Release,
+ SSID);
return nullptr;
}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM in principle.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we do this without modifying the base API? Looks like the SSID information is enclosed in the instruction which we can query in NVPTX specific NVPTXTargetLowering::emitTrailingFence
/NVPTXTargetLowering::emitLeadingFence
.
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) { | |||
|
|||
if (TLI->shouldInsertFencesForAtomic(I)) { | |||
auto FenceOrdering = AtomicOrdering::Monotonic; | |||
SyncScope::ID SSID = SyncScope::System; | |||
if (LI && isAcquireOrStronger(LI->getOrdering())) { | |||
FenceOrdering = LI->getOrdering(); | |||
LI->setOrdering(AtomicOrdering::Monotonic); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was expecting load, store, and rmw to require changes here as well.
If the intent is to incrementally improve it for CAS only, it may make sense to leave a TODO comment on these to call that out.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The intent was to introduce a mechanism for supporting syncscope for atomics, and as a first step, support it for CAS.
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{ | ||
return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System; | ||
}]>; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@AlexMaclean @Artem-B extra pair of eyes here would be really appreciated.
The NAME$_{sco} == NVPTX::Scope::{sco}
seems correct to me but I'm not an expert on tablegen.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks reasonable, but I wonder if we're the first target to have added functionality like this? Have any other targets added support for scoped atomics? If so how did they implement it? I wonder if this could be simplified with a complex pattern or something similar.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AMDGPU would be the most likely candidate and they do have some references to getSyncScopeID()
in their back-end. @arsenm would you happen to know the details?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There isn't a general mechanism, and I gave a lot of thought to creating one before creating this PR. The idea that I came up with was something like the following- an integer parameter to binary_atomic_op, and ternary_atomic_op called "scope".
Then either instantiate ternary_atomic_op<op, max_scopeid_across_backends> in TargetSelectionDAG, or have individual targets instantiate this multiclass based on the highest ID that they want to support.
The first idea would require backends to mutually ensure that max(syncscope_id) across backends is maintained.
multiclass ternary_atomic_op<SDNode atomic_op, int scopes = 2> { # default, singlethread and system
foreach vt = [ i8, i16, i32, i64 ] in {
def _#vt : PatFrag<(ops node:$ptr, node:$cmp, node:$val),
(atomic_op node:$ptr, node:$cmp, node:$val)> {
let IsAtomic = true;
let MemoryVT = vt;
}
foreach scope = !range(scopes) in {
defvar scope_str = !cast<string>(scope);
defvar frag_pat = (!cast<SDPatternOperator>(NAME_#_vt) node:$a node:$b node:$c)
def _#vt_#scope_#scope_str : PatFrag<!setdagop(frag_pat, ops), frag_pat>;
defm NAME#_#vt#_#scope_#scope_str : ternary_atomic_op_ord
}
defm NAME#_#vt : ternary_atomic_op_ord;
}
}
…roject into upstream/cmpxchg-syncscope
535c4f6
to
93ff279
Compare
✅ With the latest revision this PR passed the Python code formatter. |
@akshayrdeodhar when looking at the tests, e.g., here: https://github.com/llvm/llvm-project/blob/026e94ab2832d1e207439c8f52f2482206b848f5/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll#L28 I observe that a non-atomic load is being generated to read the initial CAS value. If so, that load needs to be an atomic load of the appropriate scope. |
This does make sense- the first load generated by the emulation loop should be atomic! AtomicExpand does not currently do this. Are we sure that generating an atomic load is valid for all targets? Then we'll have to modify AtomicExpand to issue atomic loads. |
if (isReleaseOrStronger(Ord)) | ||
return Ord == AtomicOrdering::SequentiallyConsistent | ||
? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) | ||
: Builder.CreateFence(AtomicOrdering::Release); | ||
? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: can you push the ternary operator inward so it is the first operand to CreateFence?
multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str, | ||
SDPatternOperator op, list<Predicate> preds> { | ||
defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;"; | ||
multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For LD/ST instructions we keep the scope as an immediate operand with a custom printing function. I think it would be a bit cleaner to re-use this system for scope here as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
printLdStCode
also prints based on the "order" modifier, yet we support order for atomics in tablegen. .scope
is a modifier explicitly defined in atom. I would prefer to keep more stuff in TableGen than in C++ source.
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{ | ||
return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System; | ||
}]>; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks reasonable, but I wonder if we're the first target to have added functionality like this? Have any other targets added support for scoped atomics? If so how did they implement it? I wonder if this could be simplified with a complex pattern or something similar.
This MR adds support for cmpxchg instructions with syncscope.