Commit 8cd5604

[AMDGPU][AtomicExpand] Use full flat emulation if a target supports f64 global atomic add instruction (#142859)
If a target supports the f64 global atomic add instruction, we can also use full flat emulation for f64 atomic fadd.
1 parent 478bdd8 commit 8cd5604
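
For orientation, here is a minimal LLVM IR sketch (mine, not part of the commit) of the kind of operation the changed predicate gates: an f64 atomicrmw fadd through a flat (generic) pointer, which the AMDGPU atomic expansion rewrites with address-space checks. The function name and the seq_cst ordering are illustrative assumptions only.

; Hypothetical input IR, not taken from the patch:
define double @flat_atomicrmw_fadd_f64(ptr %p, double %v) {
  %old = atomicrmw fadd ptr %p, double %v seq_cst
  ret double %old
}

With this change, targets whose subtarget reports hasFlatBufferGlobalAtomicFaddF64Inst() take the same full flat emulation path for f64 that was previously limited to the f32 hasAtomicFaddInsts() case; judging from the updated checks, this includes the GFX90A and GFX942 runs in the test below.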

File tree: 7 files changed, +1146 / -283 lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 3 deletions
@@ -17541,9 +17541,11 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
   // where we only insert a check for private and still use the flat instruction
   // for global and shared.
 
-  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           RMW->getType()->isFloatTy();
+  bool FullFlatEmulation =
+      RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+      ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
+       (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
+        RMW->getType()->isDoubleTy()));
 
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
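
To make the comment in the hunk above concrete, here is a hedged IR sketch of the control-flow shape that full flat emulation expands to; the block names mirror the labels visible in the GFX90A/GFX942 test output below (atomicrmw.shared, atomicrmw.check.private, atomicrmw.private, atomicrmw.global, atomicrmw.phi), but the exact instructions, intrinsic calls, orderings, and metadata the pass emits differ from this simplified version.

; Simplified sketch only; the real expansion is produced by
; emitExpandAtomicAddrSpacePredicate and carries additional flags/metadata.
define double @full_flat_emulation_shape(ptr %p, double %v) {
  %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p)
  br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private

atomicrmw.shared:                               ; LDS case: becomes ds_add_rtn_f64
  %p.lds = addrspacecast ptr %p to ptr addrspace(3)
  %r.shared = atomicrmw fadd ptr addrspace(3) %p.lds, double %v seq_cst
  br label %atomicrmw.phi

atomicrmw.check.private:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %p)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                              ; scratch case: plain load/add/store
  %p.scratch = addrspacecast ptr %p to ptr addrspace(5)
  %r.private = load double, ptr addrspace(5) %p.scratch
  %sum = fadd double %r.private, %v
  store double %sum, ptr addrspace(5) %p.scratch
  br label %atomicrmw.phi

atomicrmw.global:                               ; keep using the flat/global atomic
  %r.global = atomicrmw fadd ptr %p, double %v seq_cst
  br label %atomicrmw.phi

atomicrmw.phi:
  %r = phi double [ %r.shared, %atomicrmw.shared ], [ %r.private, %atomicrmw.private ], [ %r.global, %atomicrmw.global ]
  ret double %r
}

declare i1 @llvm.amdgcn.is.shared(ptr)
declare i1 @llvm.amdgcn.is.private(ptr)

Note that in the optnone test below the global leg is itself further expanded into a global_atomic_cmpswap_x2 loop (the %atomicrmw.start block); that CAS expansion is orthogonal to which address-space checks this patch enables.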

llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll

Lines changed: 105 additions & 47 deletions
@@ -799,6 +799,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX90A-LABEL: optnone_atomicrmw_fadd_f64_expand:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: s_mov_b32 s6, 32
+; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
+; GFX90A-NEXT: s_getpc_b64 s[6:7]
+; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
+; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
+; GFX90A-NEXT: s_cmp_eq_u32 s7, s4
+; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX90A-NEXT: s_mov_b64 s[4:5], -1
+; GFX90A-NEXT: s_mov_b32 s6, 1
+; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
+; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: s_cbranch_vccnz .LBB5_3
+; GFX90A-NEXT: .LBB5_1: ; %Flow4
+; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX90A-NEXT: s_mov_b32 s4, 1
+; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4
+; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_vccnz .LBB5_10
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.shared
+; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v0, v[0:1]
+; GFX90A-NEXT: s_branch .LBB5_10
+; GFX90A-NEXT: .LBB5_3: ; %atomicrmw.check.private
 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
 ; GFX90A-NEXT: s_mov_b32 s6, 32
 ; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
@@ -813,50 +838,54 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX90A-NEXT: s_cbranch_vccnz .LBB5_2
-; GFX90A-NEXT: s_branch .LBB5_3
-; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.private
+; GFX90A-NEXT: s_cbranch_vccnz .LBB5_5
+; GFX90A-NEXT: s_branch .LBB5_6
+; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.private
 ; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
-; GFX90A-NEXT: s_branch .LBB5_6
-; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.global
+; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
+; GFX90A-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_branch .LBB5_9
+; GFX90A-NEXT: .LBB5_5: ; %atomicrmw.global
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT: s_getpc_b64 s[4:5]
 ; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4
 ; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; GFX90A-NEXT: global_load_dwordx2 v[2:3], v2, s[4:5]
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: s_branch .LBB5_4
-; GFX90A-NEXT: .LBB5_3: ; %Flow
+; GFX90A-NEXT: s_branch .LBB5_7
+; GFX90A-NEXT: .LBB5_6: ; %Flow
 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_vccnz .LBB5_1
-; GFX90A-NEXT: s_branch .LBB5_6
-; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.start
+; GFX90A-NEXT: s_cbranch_vccnz .LBB5_4
+; GFX90A-NEXT: s_branch .LBB5_9
+; GFX90A-NEXT: .LBB5_7: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
 ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
 ; GFX90A-NEXT: s_getpc_b64 s[6:7]
 ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
 ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[6:7] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5]
 ; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB5_4
-; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.end1
+; GFX90A-NEXT: s_cbranch_execnz .LBB5_7
+; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end1
 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: s_branch .LBB5_3
-; GFX90A-NEXT: .LBB5_6: ; %atomicrmw.phi
-; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.end
+; GFX90A-NEXT: s_branch .LBB5_6
+; GFX90A-NEXT: .LBB5_9: ; %Flow3
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_branch .LBB5_1
+; GFX90A-NEXT: .LBB5_10: ; %atomicrmw.phi
+; GFX90A-NEXT: ; %bb.11: ; %atomicrmw.end
 ; GFX90A-NEXT: s_mov_b32 s4, 32
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT: v_lshrrev_b64 v[4:5], s4, v[2:3]
 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2
 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4
@@ -866,6 +895,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX942-LABEL: optnone_atomicrmw_fadd_f64_expand:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-NEXT: s_mov_b32 s2, 32
+; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX942-NEXT: s_getpc_b64 s[2:3]
+; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
+; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
+; GFX942-NEXT: s_cmp_eq_u32 s3, s0
+; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX942-NEXT: s_mov_b64 s[0:1], -1
+; GFX942-NEXT: s_mov_b32 s2, 1
+; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
+; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
+; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT: s_cbranch_vccnz .LBB5_3
+; GFX942-NEXT: .LBB5_1: ; %Flow4
+; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX942-NEXT: s_mov_b32 s0, 1
+; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0
+; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_vccnz .LBB5_10
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.shared
+; GFX942-NEXT: ds_add_rtn_f64 v[2:3], v0, v[0:1]
+; GFX942-NEXT: s_branch .LBB5_10
+; GFX942-NEXT: .LBB5_3: ; %atomicrmw.check.private
 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base
 ; GFX942-NEXT: s_mov_b32 s2, 32
 ; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
@@ -880,48 +934,52 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
 ; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
 ; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX942-NEXT: s_cbranch_vccnz .LBB5_2
-; GFX942-NEXT: s_branch .LBB5_3
-; GFX942-NEXT: .LBB5_1: ; %atomicrmw.private
+; GFX942-NEXT: s_cbranch_vccnz .LBB5_5
+; GFX942-NEXT: s_branch .LBB5_6
+; GFX942-NEXT: .LBB5_4: ; %atomicrmw.private
 ; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX942-NEXT: scratch_store_dwordx2 off, v[0:1], s0
-; GFX942-NEXT: s_branch .LBB5_6
-; GFX942-NEXT: .LBB5_2: ; %atomicrmw.global
+; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
+; GFX942-NEXT: scratch_store_dwordx2 off, v[4:5], s0
+; GFX942-NEXT: s_branch .LBB5_9
+; GFX942-NEXT: .LBB5_5: ; %atomicrmw.global
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
 ; GFX942-NEXT: s_getpc_b64 s[0:1]
 ; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4
 ; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX942-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[0:1]
 ; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: s_branch .LBB5_4
-; GFX942-NEXT: .LBB5_3: ; %Flow
+; GFX942-NEXT: s_branch .LBB5_7
+; GFX942-NEXT: .LBB5_6: ; %Flow
 ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_vccnz .LBB5_1
-; GFX942-NEXT: s_branch .LBB5_6
-; GFX942-NEXT: .LBB5_4: ; %atomicrmw.start
+; GFX942-NEXT: s_cbranch_vccnz .LBB5_4
+; GFX942-NEXT: s_branch .LBB5_9
+; GFX942-NEXT: .LBB5_7: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
 ; GFX942-NEXT: s_getpc_b64 s[2:3]
 ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
 ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
-; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] sc0 sc1
-; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[2:3] sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5]
 ; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_execnz .LBB5_4
-; GFX942-NEXT: ; %bb.5: ; %atomicrmw.end1
+; GFX942-NEXT: s_cbranch_execnz .LBB5_7
+; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end1
 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_mov_b64 s[0:1], 0
-; GFX942-NEXT: s_branch .LBB5_3
-; GFX942-NEXT: .LBB5_6: ; %atomicrmw.phi
-; GFX942-NEXT: ; %bb.7: ; %atomicrmw.end
+; GFX942-NEXT: s_branch .LBB5_6
+; GFX942-NEXT: .LBB5_9: ; %Flow3
+; GFX942-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-NEXT: s_branch .LBB5_1
+; GFX942-NEXT: .LBB5_10: ; %atomicrmw.phi
+; GFX942-NEXT: ; %bb.11: ; %atomicrmw.end
 ; GFX942-NEXT: s_mov_b32 s0, 32
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT: v_lshrrev_b64 v[4:5], s0, v[2:3]
 ; GFX942-NEXT: v_mov_b32_e32 v0, v2
 ; GFX942-NEXT: v_mov_b32_e32 v1, v4
