Commit cf230e7

[AMDGPU] Enable atomic optimizer for divergent i64 and double values (#96934)
1 parent b42c332 commit cf230e7

15 files changed, +39203 -9027 lines changed
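What changed, in effect: the optimizer previously bailed out whenever a divergent value operand was not exactly 32 bits wide, and whenever the subtarget lacked DPP. With this commit, divergent i64 and double values are also optimized, and the DPP requirement applies only when the DPP scan strategy is selected. A minimal LLVM IR sketch of an input that now qualifies (a hypothetical kernel for illustration, not a test from this commit):

declare i32 @llvm.amdgcn.workitem.id.x()

; The value operand %val differs per lane, so it is divergent; the pass can
; now rewrite this 64-bit atomic into a wavefront-wide scan plus a single
; atomic executed by one lane.
define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %val = zext i32 %lane to i64
  %old = atomicrmw add ptr addrspace(1) %ptr, i64 %val syncscope("agent") monotonic
  store i64 %old, ptr addrspace(1) %out
  ret void
}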

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 30 additions & 11 deletions
@@ -178,6 +178,20 @@ bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
   return Changed;
 }
 
+static bool isLegalCrossLaneType(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    return true;
+  case Type::IntegerTyID: {
+    unsigned Size = Ty->getIntegerBitWidth();
+    return (Size == 32 || Size == 64);
+  }
+  default:
+    return false;
+  }
+}
+
 void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
   // Early exit for unhandled address space atomic instructions.
   switch (I.getPointerAddressSpace()) {
@@ -228,11 +242,14 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
 
   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
-  // we have DPP available on our subtarget, and the atomic operation is 32
-  // bits.
-  if (ValDivergent &&
-      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
-    return;
+  // we have DPP available on our subtarget (for DPP strategy), and the atomic
+  // operation is 32 or 64 bits.
+  if (ValDivergent) {
+    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+      return;
+
+    if (!isLegalCrossLaneType(I.getType()))
+      return;
   }
 
   // If we get here, we can optimize the atomic using a single wavefront-wide
@@ -311,11 +328,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
 
   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
-  // we have DPP available on our subtarget, and the atomic operation is 32
-  // bits.
-  if (ValDivergent &&
-      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
-    return;
+  // we have DPP available on our subtarget (for DPP strategy), and the atomic
+  // operation is 32 or 64 bits.
+  if (ValDivergent) {
+    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
+      return;
+
+    if (!isLegalCrossLaneType(I.getType()))
+      return;
   }
 
   // If any of the other arguments to the intrinsic are divergent, we can't
@@ -748,7 +768,6 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     // of each active lane in the wavefront. This will be our new value
     // which we will provide to the atomic operation.
    Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-    assert(TyBitWidth == 32);
     NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                              {NewV, LastLaneIdx});
   }

llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll

Lines changed: 1186 additions & 188 deletions
Large diffs are not rendered by default.
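The f64 GlobalISel coverage above exercises the same gating for floating point. A sketch of the shape of atomic involved (hypothetical and simplified; the actual tests in that file are not rendered here):

declare i32 @llvm.amdgcn.workitem.id.x()

; A divergent double operand: previously rejected because it is not 32 bits,
; now accepted by isLegalCrossLaneType.
define amdgpu_kernel void @fadd_f64_varying(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %val = uitofp i32 %lane to double
  %old = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
  store double %old, ptr addrspace(1) %out
  ret void
}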

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Lines changed: 104 additions & 11 deletions
@@ -574,13 +574,44 @@ entry:
 define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
 ; GFX6-LABEL: add_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b32 s4, 0
+; GFX6-NEXT:    ; implicit-def: $vgpr1
+; GFX6-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_ff1_i32_b64 s5, s[0:1]
+; GFX6-NEXT:    s_mov_b32 m0, s5
+; GFX6-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX6-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
+; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT:    s_add_i32 s4, s4, s8
+; GFX6-NEXT:    s_cbranch_vccnz .LBB2_1
+; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    ; implicit-def: $vgpr0
+; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX6-NEXT:    s_cbranch_execz .LBB2_4
+; GFX6-NEXT:  ; %bb.3:
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0xd
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX6-NEXT:  .LBB2_4:
+; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
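Note the knock-on effect in this 32-bit test: GFX6 has no DPP, and the old code rejected every divergent value on such subtargets. Since the DPP check now applies only to the DPP strategy, the iterative scan (the ComputeLoop above) fires on GFX6 as well. The test input is presumably of this shape (a sketch; the intrinsic declaration is assumed from the upstream raw buffer atomic intrinsics, not copied from the test file):

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32 immarg)

define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
entry:
  ; the per-lane id makes the vdata operand divergent
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %lane, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
  store i32 %old, ptr addrspace(1) %out
  ret void
}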
@@ -937,15 +968,46 @@ entry:
 define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) {
 ; GFX6-LABEL: struct_add_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dword s8, s[2:3], 0x11
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b32 s4, 0
+; GFX6-NEXT:    ; implicit-def: $vgpr1
+; GFX6-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_ff1_i32_b64 s5, s[0:1]
+; GFX6-NEXT:    s_mov_b32 m0, s5
+; GFX6-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX6-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
+; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT:    s_add_i32 s4, s4, s8
+; GFX6-NEXT:    s_cbranch_vccnz .LBB3_1
+; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    ; implicit-def: $vgpr0
+; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX6-NEXT:    s_cbranch_execz .LBB3_4
+; GFX6-NEXT:  ; %bb.3:
+; GFX6-NEXT:    s_load_dword s5, s[2:3], 0x11
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0xd
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, s8
-; GFX6-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX6-NEXT:  .LBB3_4:
+; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -2011,13 +2073,44 @@ entry:
 define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
 ; GFX6-LABEL: sub_i32_varying_vdata:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b32 s4, 0
+; GFX6-NEXT:    ; implicit-def: $vgpr1
+; GFX6-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_ff1_i32_b64 s5, s[0:1]
+; GFX6-NEXT:    s_mov_b32 m0, s5
+; GFX6-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX6-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX6-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX6-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
+; GFX6-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GFX6-NEXT:    s_add_i32 s4, s4, s8
+; GFX6-NEXT:    s_cbranch_vccnz .LBB7_1
+; GFX6-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    ; implicit-def: $vgpr0
+; GFX6-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX6-NEXT:    s_cbranch_execz .LBB7_4
+; GFX6-NEXT:  ; %bb.3:
+; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0xd
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX6-NEXT:  .LBB7_4:
+; GFX6-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
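Across all three hunks the tail after the atomic is the same: v_readfirstlane_b32 broadcasts the value the single atomic returned, and each lane then combines it with its own exclusive prefix sum (kept in v1 by the ComputeLoop) via v_add_i32_e32, or v_sub_i32_e32 for the sub case. At the IR level the reconstruction amounts to this (a sketch with hypothetical names; the overloaded readfirstlane intrinsic is assumed):

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)

; %base is the single atomic's return value; %prefix is this lane's
; exclusive prefix sum from the iterative scan.
define i32 @reconstruct_lane_result(i32 %base, i32 %prefix) {
  %uniform = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %base)
  %result = add i32 %uniform, %prefix   ; sub i32 %uniform, %prefix for sub atomics
  ret i32 %result
}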
