
AMDGPU: Add codegen for atomicrmw operations usub_cond and usub_sat #141068

Open
wants to merge 2 commits into base: main
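
For context, these are the two unsigned-subtract atomicrmw operations from the LLVM LangRef: usub_cond subtracts the operand only when the current value is at least as large (otherwise memory is left unchanged), and usub_sat subtracts with the result clamped at zero. A minimal IR sketch (function and value names are illustrative, not taken from this PR's tests):

define i32 @usub_examples(ptr addrspace(1) %p, i32 %v) {
  ; usub_cond: store (old u>= %v) ? old - %v : old, return old
  %old.cond = atomicrmw usub_cond ptr addrspace(1) %p, i32 %v seq_cst
  ; usub_sat: store (old u>= %v) ? old - %v : 0, return old
  %old.sat = atomicrmw usub_sat ptr addrspace(1) %p, i32 %v seq_cst
  ret i32 %old.cond
}

With this patch, the 32-bit forms can select to native instructions where available (ds_cond_sub_rtn_u32 / ds_sub_clamp_rtn_u32 for LDS, and the flat/global cond_sub_u32 and csub forms), while 64-bit forms fall back to a cmpxchg expansion, as handled in SIISelLowering.cpp below.
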
2 changes: 2 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1612,6 +1612,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_usub_cond : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_usub_sat : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -273,6 +273,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;

2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4077,6 +4077,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
case TargetOpcode::G_ATOMICRMW_USUB_COND:
case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -692,6 +692,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;

def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1671,6 +1671,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}

auto &Atomics32 =
getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
.legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
Atomics32.legalFor({{S32, FlatPtr}});
}

// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
11 changes: 7 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1744,6 +1744,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::FMin:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
case AtomicRMWInst::USubCond:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond;
break;
case AtomicRMWInst::USubSat:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat;
break;
Comment on lines +1747 to +1752 (Contributor): These should be tested in one of the llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-* tests

case AtomicRMWInst::FSub: {
report_fatal_error("atomic floating point subtraction not supported for "
"buffer resources and should've been expanded away");
@@ -1766,13 +1772,10 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
report_fatal_error("wrapping increment/decrement not supported for "
"buffer resources and should've ben expanded away");
"buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
case AtomicRMWInst::USubCond:
case AtomicRMWInst::USubSat:
break;
}
}

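Regarding the reviewer note above about buffer-fat-pointer coverage, a minimal sketch of the kind of IR such a test could exercise (assuming addrspace(7) buffer fat pointers; the function name is illustrative):

define i32 @fat_ptr_usub_sat(ptr addrspace(7) %p, i32 %v) {
  ; expected to be rewritten by AMDGPULowerBufferFatPointers into a call to
  ; llvm.amdgcn.raw.ptr.buffer.atomic.usub.sat on the split rsrc/offset parts
  %old = atomicrmw usub_sat ptr addrspace(7) %p, i32 %v seq_cst
  ret i32 %old
}
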
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5419,6 +5419,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
case AMDGPU::G_ATOMICRMW_USUB_COND:
case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
32 changes: 32 additions & 0 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1089,7 +1089,31 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
}
}

multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
def : DSAtomicRetPat<inst, vt,
!cast<PatFrag>(frag#"_local_m0_"#vt)>;
def : DSAtomicRetPat<noRetInst, vt,
!cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
}

Comment on lines +1094 to +1100 (Contributor): Is this dead? Weren't these only introduced after gfx9?

let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
!cast<PatFrag>(frag#"_local_"#vt)>;
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
!cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
}

let OtherPredicates = [HasGDS] in {
Contributor comment: GDS case isn't tested, but GDS atomic support is mostly missing for every other operator

def : DSAtomicRetPat<inst, vt,
!cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
def : DSAtomicRetPat<noRetInst, vt,
!cast<PatFrag>(frag#"_region_m0_noret_"#vt),
/* complexity */ 1, /* gds */ 1>;
}
}

let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
@@ -1172,6 +1196,14 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_l
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}

let SubtargetPredicate = isGFX12Plus in {

defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;

defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;

} // let SubtargetPredicate = isGFX12Plus

let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
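
A quick sketch of the LDS form these new gfx12 patterns are meant to cover (names are illustrative):

define i32 @lds_usub_cond(ptr addrspace(3) %p, i32 %v) {
  ; expected to select to ds_cond_sub_rtn_u32 on gfx12 (ds_sub_clamp_rtn_u32 for usub_sat)
  %old = atomicrmw usub_cond ptr addrspace(3) %p, i32 %v seq_cst
  ret i32 %old
}
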
20 changes: 11 additions & 9 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1642,13 +1642,17 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}

let SubtargetPredicate = isGFX12Plus in {
defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;

let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
Contributor comment: Another wrong use of "CSub" predicate.

defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
}
} // end foreach as

let SubtargetPredicate = isGFX12Plus in {
defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;

let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
}

let OtherPredicates = [HasD16LoadStore] in {
@@ -1788,10 +1792,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;

let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;

defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -1808,10 +1812,8 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;

let SubtargetPredicate = isGFX12Plus in {
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;

let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}

let OtherPredicates = [isGFX12Plus] in {
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2191,6 +2191,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::USubCond:
case AtomicRMWInst::USubSat:
if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
unsigned Size = IntTy->getBitWidth();
if (Size == 32)
return AtomicExpansionKind::None;
}
return AtomicExpansionKind::CmpXChg;
Comment on lines +2196 to +2201 (Contributor): This isn't tested? I'm also assuming r600 didn't have these and should just return cmpxchg

case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
13 changes: 9 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -994,6 +994,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
ISD::ATOMIC_LOAD_USUB_COND,
ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});

@@ -16806,10 +16808,10 @@ static bool isV2BF16(Type *Ty) {
}

/// \return true if atomicrmw integer ops work for the type.
static bool isAtomicRMWLegalIntTy(Type *Ty) {
static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
unsigned BW = IT->getBitWidth();
return BW == 32 || BW == 64;
return BW == 32 || (BW == 64 && Allow64);
}
Comment on lines +16811 to 16815 (Contributor): I would just leave this alone and handle the one special case separately


return false;
@@ -16861,8 +16863,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,

/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
return isAtomicRMWLegalIntTy(RMW->getType())
atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
? TargetLowering::AtomicExpansionKind::None
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
@@ -16931,6 +16933,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
return atomicSupportedIfLegalIntType(RMW);
case AtomicRMWInst::USubCond:
case AtomicRMWInst::USubSat:
return atomicSupportedIfLegalIntType(RMW, false);
case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor: {
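
Because Allow64 is passed as false for these two operations, only the 32-bit forms stay as native atomics; a 64-bit usub_cond or usub_sat should be expanded to a cmpxchg loop by AtomicExpand. A minimal sketch of IR that would take the expansion path (names illustrative):

define i64 @usub_sat_i64(ptr addrspace(1) %p, i64 %v) {
  ; 64-bit form: expected to be expanded into a cmpxchg loop rather than selected directly
  %old = atomicrmw usub_sat ptr addrspace(1) %p, i64 %v seq_cst
  ret i64 %old
}
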
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -809,6 +809,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
@@ -82,6 +82,12 @@ body: |
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
%20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5

; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_COND
%21:_(s32) = G_ATOMICRMW_USUB_COND %1, %5

; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_SAT
%22:_(s32) = G_ATOMICRMW_USUB_SAT %1, %5

$vgpr0 = COPY %4(s32)
SI_RETURN implicit $vgpr0
