-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Add codegen for atomicrmw operations usub_cond and usub_sat #141068
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
AMDGPU: Add codegen for atomicrmw operations usub_cond and usub_sat #141068
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-ir Author: None (anjenner) ChangesSplit off from #105553 as per discussion there. Patch is 234.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141068.diff 30 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 412993755dac8..77429c88dfa3e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1612,6 +1612,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_cond : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_sat : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc555..f80ba66053e1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -273,6 +273,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index cf72608420add..81f2d3ac686ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4077,6 +4077,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_USUB_COND:
+ case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 18a948d68e97b..b33e4de72acd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -692,6 +692,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
+defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
+defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 667c466a998e0..4603e2112d3ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1671,6 +1671,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomics32 =
+ getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics32.legalFor({{S32, FlatPtr}});
+ }
+
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 20ccdcaa0e9a0..fc7f74f0f6f36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1744,6 +1744,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::FMin:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
+ case AtomicRMWInst::USubCond:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond;
+ break;
+ case AtomicRMWInst::USubSat:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat;
+ break;
case AtomicRMWInst::FSub: {
report_fatal_error("atomic floating point subtraction not supported for "
"buffer resources and should've been expanded away");
@@ -1766,13 +1772,10 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
report_fatal_error("wrapping increment/decrement not supported for "
- "buffer resources and should've ben expanded away");
+ "buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- break;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..db0f68aa3b225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5419,6 +5419,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
+ case AMDGPU::G_ATOMICRMW_USUB_COND:
+ case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 604eb7f2c3878..0d51f75eb3be8 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1089,7 +1089,34 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
}
}
+multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+ }
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+ }
+
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
+ /* complexity */ 0, /* gds */ 1>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+ /* complexity */ 1, /* gds */ 1>;
+ }
+}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
@@ -1172,6 +1199,14 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_l
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
+let SubtargetPredicate = isGFX12Plus in {
+
+defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+
+defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
+
+} // let SubtargetPredicate = isGFX12Plus
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c17fda1346115..17d2f0159fa9f 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1642,6 +1642,12 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}
+let SubtargetPredicate = isGFX12Plus in {
+ defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+}
} // end foreach as
let SubtargetPredicate = isGFX12Plus in {
@@ -1788,10 +1794,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
-defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -1808,10 +1814,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12Plus] in {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 9c2811006bc1c..a2f9a3ad6ea3a 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2191,6 +2191,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
+ unsigned Size = IntTy->getBitWidth();
+ if (Size == 32)
+ return AtomicExpansionKind::None;
+ }
+ return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2d337fafe6dc2..c5d617d1f6be4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -994,6 +994,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
+ ISD::ATOMIC_LOAD_USUB_COND,
+ ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -16806,10 +16808,10 @@ static bool isV2BF16(Type *Ty) {
}
/// \return true if atomicrmw integer ops work for the type.
-static bool isAtomicRMWLegalIntTy(Type *Ty) {
+static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
unsigned BW = IT->getBitWidth();
- return BW == 32 || BW == 64;
+ return BW == 32 || (BW == 64 && Allow64);
}
return false;
@@ -16861,8 +16863,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
-atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
- return isAtomicRMWLegalIntTy(RMW->getType())
+atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
+ return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
? TargetLowering::AtomicExpansionKind::None
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
@@ -16931,6 +16933,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
return atomicSupportedIfLegalIntType(RMW);
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ return atomicSupportedIfLegalIntType(RMW, false);
case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 84a6aeacc226a..d15f2339899fc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -809,6 +809,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
+defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
+defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index 6729faf233c35..d7862e8373121 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -82,6 +82,12 @@ body: |
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
%20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_COND
+ %21:_(s32) = G_ATOMICRMW_USUB_COND %1, %5
+
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_SAT
+ %22:_(s32) = G_ATOMICRMW_USUB_SAT %1, %5
+
$vgpr0 = COPY %4(s32)
SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index d0d4f4bedf314..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,213 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
- ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: None (anjenner) ChangesSplit off from #105553 as per discussion there. Patch is 234.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141068.diff 30 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 412993755dac8..77429c88dfa3e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1612,6 +1612,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_cond : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_sat : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc555..f80ba66053e1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -273,6 +273,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index cf72608420add..81f2d3ac686ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4077,6 +4077,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_USUB_COND:
+ case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 18a948d68e97b..b33e4de72acd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -692,6 +692,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
+defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
+defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 667c466a998e0..4603e2112d3ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1671,6 +1671,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomics32 =
+ getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics32.legalFor({{S32, FlatPtr}});
+ }
+
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 20ccdcaa0e9a0..fc7f74f0f6f36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1744,6 +1744,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::FMin:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
+ case AtomicRMWInst::USubCond:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond;
+ break;
+ case AtomicRMWInst::USubSat:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat;
+ break;
case AtomicRMWInst::FSub: {
report_fatal_error("atomic floating point subtraction not supported for "
"buffer resources and should've been expanded away");
@@ -1766,13 +1772,10 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
report_fatal_error("wrapping increment/decrement not supported for "
- "buffer resources and should've ben expanded away");
+ "buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- break;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..db0f68aa3b225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5419,6 +5419,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
+ case AMDGPU::G_ATOMICRMW_USUB_COND:
+ case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 604eb7f2c3878..0d51f75eb3be8 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1089,7 +1089,34 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
}
}
+multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+ }
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+ }
+
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
+ /* complexity */ 0, /* gds */ 1>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+ /* complexity */ 1, /* gds */ 1>;
+ }
+}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
@@ -1172,6 +1199,14 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_l
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
+let SubtargetPredicate = isGFX12Plus in {
+
+defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+
+defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
+
+} // let SubtargetPredicate = isGFX12Plus
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c17fda1346115..17d2f0159fa9f 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1642,6 +1642,12 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}
+let SubtargetPredicate = isGFX12Plus in {
+ defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+}
} // end foreach as
let SubtargetPredicate = isGFX12Plus in {
@@ -1788,10 +1794,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
-defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -1808,10 +1814,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12Plus] in {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 9c2811006bc1c..a2f9a3ad6ea3a 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2191,6 +2191,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
+ unsigned Size = IntTy->getBitWidth();
+ if (Size == 32)
+ return AtomicExpansionKind::None;
+ }
+ return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2d337fafe6dc2..c5d617d1f6be4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -994,6 +994,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
+ ISD::ATOMIC_LOAD_USUB_COND,
+ ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -16806,10 +16808,10 @@ static bool isV2BF16(Type *Ty) {
}
/// \return true if atomicrmw integer ops work for the type.
-static bool isAtomicRMWLegalIntTy(Type *Ty) {
+static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
unsigned BW = IT->getBitWidth();
- return BW == 32 || BW == 64;
+ return BW == 32 || (BW == 64 && Allow64);
}
return false;
@@ -16861,8 +16863,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
-atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
- return isAtomicRMWLegalIntTy(RMW->getType())
+atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
+ return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
? TargetLowering::AtomicExpansionKind::None
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
@@ -16931,6 +16933,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
return atomicSupportedIfLegalIntType(RMW);
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ return atomicSupportedIfLegalIntType(RMW, false);
case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 84a6aeacc226a..d15f2339899fc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -809,6 +809,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
+defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
+defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index 6729faf233c35..d7862e8373121 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -82,6 +82,12 @@ body: |
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
%20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_COND
+ %21:_(s32) = G_ATOMICRMW_USUB_COND %1, %5
+
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_SAT
+ %22:_(s32) = G_ATOMICRMW_USUB_SAT %1, %5
+
$vgpr0 = COPY %4(s32)
SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index d0d4f4bedf314..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,213 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
- ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ...
[truncated]
|
@llvm/pr-subscribers-llvm-globalisel Author: None (anjenner) ChangesSplit off from #105553 as per discussion there. Patch is 234.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141068.diff 30 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 412993755dac8..77429c88dfa3e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1612,6 +1612,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_cond : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_sat : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc555..f80ba66053e1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -273,6 +273,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index cf72608420add..81f2d3ac686ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4077,6 +4077,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_USUB_COND:
+ case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 18a948d68e97b..b33e4de72acd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -692,6 +692,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
+defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
+defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 667c466a998e0..4603e2112d3ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1671,6 +1671,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomics32 =
+ getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics32.legalFor({{S32, FlatPtr}});
+ }
+
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 20ccdcaa0e9a0..fc7f74f0f6f36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1744,6 +1744,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::FMin:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
+ case AtomicRMWInst::USubCond:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond;
+ break;
+ case AtomicRMWInst::USubSat:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat;
+ break;
case AtomicRMWInst::FSub: {
report_fatal_error("atomic floating point subtraction not supported for "
"buffer resources and should've been expanded away");
@@ -1766,13 +1772,10 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
report_fatal_error("wrapping increment/decrement not supported for "
- "buffer resources and should've ben expanded away");
+ "buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- break;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..db0f68aa3b225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5419,6 +5419,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
+ case AMDGPU::G_ATOMICRMW_USUB_COND:
+ case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 604eb7f2c3878..0d51f75eb3be8 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1089,7 +1089,34 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
}
}
+multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+ }
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+ }
+
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
+ /* complexity */ 0, /* gds */ 1>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+ /* complexity */ 1, /* gds */ 1>;
+ }
+}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
@@ -1172,6 +1199,14 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_l
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
+let SubtargetPredicate = isGFX12Plus in {
+
+defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+
+defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
+
+} // let SubtargetPredicate = isGFX12Plus
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c17fda1346115..17d2f0159fa9f 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1642,6 +1642,12 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}
+let SubtargetPredicate = isGFX12Plus in {
+ defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+}
} // end foreach as
let SubtargetPredicate = isGFX12Plus in {
@@ -1788,10 +1794,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
-defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -1808,10 +1814,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12Plus] in {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 9c2811006bc1c..a2f9a3ad6ea3a 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2191,6 +2191,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
+ unsigned Size = IntTy->getBitWidth();
+ if (Size == 32)
+ return AtomicExpansionKind::None;
+ }
+ return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2d337fafe6dc2..c5d617d1f6be4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -994,6 +994,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
+ ISD::ATOMIC_LOAD_USUB_COND,
+ ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -16806,10 +16808,10 @@ static bool isV2BF16(Type *Ty) {
}
/// \return true if atomicrmw integer ops work for the type.
-static bool isAtomicRMWLegalIntTy(Type *Ty) {
+static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
unsigned BW = IT->getBitWidth();
- return BW == 32 || BW == 64;
+ return BW == 32 || (BW == 64 && Allow64);
}
return false;
@@ -16861,8 +16863,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
-atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
- return isAtomicRMWLegalIntTy(RMW->getType())
+atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
+ return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
? TargetLowering::AtomicExpansionKind::None
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
@@ -16931,6 +16933,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
return atomicSupportedIfLegalIntType(RMW);
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ return atomicSupportedIfLegalIntType(RMW, false);
case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 84a6aeacc226a..d15f2339899fc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -809,6 +809,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
+defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
+defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index 6729faf233c35..d7862e8373121 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -82,6 +82,12 @@ body: |
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
%20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_COND
+ %21:_(s32) = G_ATOMICRMW_USUB_COND %1, %5
+
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_SAT
+ %22:_(s32) = G_ATOMICRMW_USUB_SAT %1, %5
+
$vgpr0 = COPY %4(s32)
SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index d0d4f4bedf314..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,213 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
- ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: None (anjenner) ChangesSplit off from #105553 as per discussion there. Patch is 234.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141068.diff 30 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 412993755dac8..77429c88dfa3e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1612,6 +1612,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_cond : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_sat : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc555..f80ba66053e1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -273,6 +273,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index cf72608420add..81f2d3ac686ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4077,6 +4077,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_USUB_COND:
+ case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 18a948d68e97b..b33e4de72acd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -692,6 +692,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
+defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
+defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 667c466a998e0..4603e2112d3ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1671,6 +1671,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomics32 =
+ getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics32.legalFor({{S32, FlatPtr}});
+ }
+
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 20ccdcaa0e9a0..fc7f74f0f6f36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1744,6 +1744,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::FMin:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
+ case AtomicRMWInst::USubCond:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond;
+ break;
+ case AtomicRMWInst::USubSat:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat;
+ break;
case AtomicRMWInst::FSub: {
report_fatal_error("atomic floating point subtraction not supported for "
"buffer resources and should've been expanded away");
@@ -1766,13 +1772,10 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
report_fatal_error("wrapping increment/decrement not supported for "
- "buffer resources and should've ben expanded away");
+ "buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- break;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..db0f68aa3b225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5419,6 +5419,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
+ case AMDGPU::G_ATOMICRMW_USUB_COND:
+ case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 604eb7f2c3878..0d51f75eb3be8 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1089,7 +1089,34 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
}
}
+multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+ }
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+ }
+
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
+ /* complexity */ 0, /* gds */ 1>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+ /* complexity */ 1, /* gds */ 1>;
+ }
+}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
@@ -1172,6 +1199,14 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_l
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
+let SubtargetPredicate = isGFX12Plus in {
+
+defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+
+defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
+
+} // let SubtargetPredicate = isGFX12Plus
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c17fda1346115..17d2f0159fa9f 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1642,6 +1642,12 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}
+let SubtargetPredicate = isGFX12Plus in {
+ defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+}
} // end foreach as
let SubtargetPredicate = isGFX12Plus in {
@@ -1788,10 +1794,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
-defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -1808,10 +1814,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12Plus] in {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 9c2811006bc1c..a2f9a3ad6ea3a 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2191,6 +2191,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
+ unsigned Size = IntTy->getBitWidth();
+ if (Size == 32)
+ return AtomicExpansionKind::None;
+ }
+ return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2d337fafe6dc2..c5d617d1f6be4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -994,6 +994,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
+ ISD::ATOMIC_LOAD_USUB_COND,
+ ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -16806,10 +16808,10 @@ static bool isV2BF16(Type *Ty) {
}
/// \return true if atomicrmw integer ops work for the type.
-static bool isAtomicRMWLegalIntTy(Type *Ty) {
+static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
unsigned BW = IT->getBitWidth();
- return BW == 32 || BW == 64;
+ return BW == 32 || (BW == 64 && Allow64);
}
return false;
@@ -16861,8 +16863,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
-atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
- return isAtomicRMWLegalIntTy(RMW->getType())
+atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
+ return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
? TargetLowering::AtomicExpansionKind::None
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
@@ -16931,6 +16933,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
return atomicSupportedIfLegalIntType(RMW);
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ return atomicSupportedIfLegalIntType(RMW, false);
case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 84a6aeacc226a..d15f2339899fc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -809,6 +809,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
+defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
+defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index 6729faf233c35..d7862e8373121 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -82,6 +82,12 @@ body: |
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
%20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_COND
+ %21:_(s32) = G_ATOMICRMW_USUB_COND %1, %5
+
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_SAT
+ %22:_(s32) = G_ATOMICRMW_USUB_SAT %1, %5
+
$vgpr0 = COPY %4(s32)
SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index d0d4f4bedf314..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,213 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
- ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ...
[truncated]
|
Split off from llvm#105553 as per discussion there.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for doing this!
let OtherPredicates = [LDSRequiresM0Init] in { | ||
def : DSAtomicRetPat<inst, vt, | ||
!cast<PatFrag>(frag#"_local_m0_"#vt)>; | ||
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks wrong. "CSub" refers to *_atomic_csub
instructions which is the old name for *_atomic_sub_clamp
not *_atomic_cond_sub
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There doesn't seem to be a corresponding predicate for cond_sub - presumably because it's supported the nortn form from the start? In which case, it seems like just removing the predicate from these defs is the right fix, as I've done in my latest commit.
let SubtargetPredicate = isGFX12Plus in { | ||
defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >; | ||
|
||
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Another wrong use of "CSub" predicate.
@@ -1808,10 +1814,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_ | |||
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; | |||
|
|||
let SubtargetPredicate = isGFX12Plus in { | |||
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; | |||
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>; | |||
|
|||
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Another one (pre-existing).
static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) { | ||
if (auto *IT = dyn_cast<IntegerType>(Ty)) { | ||
unsigned BW = IT->getBitWidth(); | ||
return BW == 32 || BW == 64; | ||
return BW == 32 || (BW == 64 && Allow64); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would just leave this alone and handle the one special case separately
@@ -1,246 +0,0 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Deleting this should be left for the later patch
@@ -1,213 +0,0 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Deleting this should be left for the later patch
if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) { | ||
unsigned Size = IntTy->getBitWidth(); | ||
if (Size == 32) | ||
return AtomicExpansionKind::None; | ||
} | ||
return AtomicExpansionKind::CmpXChg; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't tested? I'm also assuming r600 didn't have these and should just return cmpxchg
let OtherPredicates = [LDSRequiresM0Init] in { | ||
def : DSAtomicRetPat<inst, vt, | ||
!cast<PatFrag>(frag#"_local_m0_"#vt)>; | ||
def : DSAtomicRetPat<noRetInst, vt, | ||
!cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this dead? Weren't these only introduced after gfx9?
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-SDAG | ||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-SDAG | ||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will need to cover the different scopes and metadata like in #122138
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-GISEL | ||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-GISEL |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think testing both 1030 and 1031 is useful. It should test the older subtargets too
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-GISEL | ||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-GISEL | ||
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL | ||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-SDAG |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test with mixed global-isel and selectiondag run lines should explicitly disable selectiondag
|
||
let OtherPredicates = [HasGDS] in { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
GDS case isn't tested, but GDS atomic support is mostly missing for every other operator
case AtomicRMWInst::USubCond: | ||
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond; | ||
break; | ||
case AtomicRMWInst::USubSat: | ||
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat; | ||
break; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These should be tested in one of the llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-* tests
Split off from #105553 as per discussion there.