@@ -799,6 +799,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
799
799
; GFX90A-LABEL: optnone_atomicrmw_fadd_f64_expand:
800
800
; GFX90A: ; %bb.0:
801
801
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
802
+ ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
803
+ ; GFX90A-NEXT: s_mov_b32 s6, 32
804
+ ; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
805
+ ; GFX90A-NEXT: s_getpc_b64 s[6:7]
806
+ ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
807
+ ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
808
+ ; GFX90A-NEXT: s_cmp_eq_u32 s7, s4
809
+ ; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0
810
+ ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
811
+ ; GFX90A-NEXT: s_mov_b64 s[4:5], -1
812
+ ; GFX90A-NEXT: s_mov_b32 s6, 1
813
+ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
814
+ ; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
815
+ ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
816
+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_3
817
+ ; GFX90A-NEXT: .LBB5_1: ; %Flow4
818
+ ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
819
+ ; GFX90A-NEXT: s_mov_b32 s4, 1
820
+ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4
821
+ ; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5]
822
+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_10
823
+ ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.shared
824
+ ; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v0, v[0:1]
825
+ ; GFX90A-NEXT: s_branch .LBB5_10
826
+ ; GFX90A-NEXT: .LBB5_3: ; %atomicrmw.check.private
802
827
; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
803
828
; GFX90A-NEXT: s_mov_b32 s6, 32
804
829
; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
@@ -813,50 +838,54 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
813
838
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
814
839
; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
815
840
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
816
- ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_2
817
- ; GFX90A-NEXT: s_branch .LBB5_3
818
- ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.private
841
+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_5
842
+ ; GFX90A-NEXT: s_branch .LBB5_6
843
+ ; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.private
819
844
; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
820
845
; GFX90A-NEXT: s_waitcnt vmcnt(0)
821
846
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
822
- ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
823
- ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
824
- ; GFX90A-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
825
- ; GFX90A-NEXT: s_branch .LBB5_6
826
- ; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.global
847
+ ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
848
+ ; GFX90A-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
849
+ ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
850
+ ; GFX90A-NEXT: s_branch .LBB5_9
851
+ ; GFX90A-NEXT: .LBB5_5: ; %atomicrmw.global
852
+ ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
827
853
; GFX90A-NEXT: s_getpc_b64 s[4:5]
828
854
; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4
829
855
; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12
830
- ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
831
- ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
856
+ ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v2, s[4:5]
832
857
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
833
- ; GFX90A-NEXT: s_branch .LBB5_4
834
- ; GFX90A-NEXT: .LBB5_3: ; %Flow
858
+ ; GFX90A-NEXT: s_branch .LBB5_7
859
+ ; GFX90A-NEXT: .LBB5_6: ; %Flow
835
860
; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5]
836
- ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_1
837
- ; GFX90A-NEXT: s_branch .LBB5_6
838
- ; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.start
861
+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_4
862
+ ; GFX90A-NEXT: s_branch .LBB5_9
863
+ ; GFX90A-NEXT: .LBB5_7: ; %atomicrmw.start
839
864
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
840
- ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
865
+ ; GFX90A-NEXT: s_waitcnt vmcnt(0)
841
866
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
842
867
; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
843
868
; GFX90A-NEXT: s_getpc_b64 s[6:7]
844
869
; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
845
870
; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
846
- ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
847
- ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
848
- ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
871
+ ; GFX90A-NEXT: v_mov_b32_e32 v6, 0
872
+ ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[6:7] glc
873
+ ; GFX90A-NEXT: s_waitcnt vmcnt(0)
849
874
; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5]
850
875
; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
851
876
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
852
- ; GFX90A-NEXT: s_cbranch_execnz .LBB5_4
853
- ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.end1
877
+ ; GFX90A-NEXT: s_cbranch_execnz .LBB5_7
878
+ ; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end1
854
879
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
855
880
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
856
- ; GFX90A-NEXT: s_branch .LBB5_3
857
- ; GFX90A-NEXT: .LBB5_6: ; %atomicrmw.phi
858
- ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.end
881
+ ; GFX90A-NEXT: s_branch .LBB5_6
882
+ ; GFX90A-NEXT: .LBB5_9: ; %Flow3
883
+ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
884
+ ; GFX90A-NEXT: s_branch .LBB5_1
885
+ ; GFX90A-NEXT: .LBB5_10: ; %atomicrmw.phi
886
+ ; GFX90A-NEXT: ; %bb.11: ; %atomicrmw.end
859
887
; GFX90A-NEXT: s_mov_b32 s4, 32
888
+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
860
889
; GFX90A-NEXT: v_lshrrev_b64 v[4:5], s4, v[2:3]
861
890
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
862
891
; GFX90A-NEXT: v_mov_b32_e32 v1, v4
@@ -866,6 +895,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
866
895
; GFX942-LABEL: optnone_atomicrmw_fadd_f64_expand:
867
896
; GFX942: ; %bb.0:
868
897
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
898
+ ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base
899
+ ; GFX942-NEXT: s_mov_b32 s2, 32
900
+ ; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
901
+ ; GFX942-NEXT: s_getpc_b64 s[2:3]
902
+ ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
903
+ ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
904
+ ; GFX942-NEXT: s_cmp_eq_u32 s3, s0
905
+ ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
906
+ ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
907
+ ; GFX942-NEXT: s_mov_b64 s[0:1], -1
908
+ ; GFX942-NEXT: s_mov_b32 s2, 1
909
+ ; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
910
+ ; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
911
+ ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
912
+ ; GFX942-NEXT: s_cbranch_vccnz .LBB5_3
913
+ ; GFX942-NEXT: .LBB5_1: ; %Flow4
914
+ ; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
915
+ ; GFX942-NEXT: s_mov_b32 s0, 1
916
+ ; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0
917
+ ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
918
+ ; GFX942-NEXT: s_cbranch_vccnz .LBB5_10
919
+ ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.shared
920
+ ; GFX942-NEXT: ds_add_rtn_f64 v[2:3], v0, v[0:1]
921
+ ; GFX942-NEXT: s_branch .LBB5_10
922
+ ; GFX942-NEXT: .LBB5_3: ; %atomicrmw.check.private
869
923
; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base
870
924
; GFX942-NEXT: s_mov_b32 s2, 32
871
925
; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
@@ -880,48 +934,52 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
880
934
; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
881
935
; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
882
936
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
883
- ; GFX942-NEXT: s_cbranch_vccnz .LBB5_2
884
- ; GFX942-NEXT: s_branch .LBB5_3
885
- ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.private
937
+ ; GFX942-NEXT: s_cbranch_vccnz .LBB5_5
938
+ ; GFX942-NEXT: s_branch .LBB5_6
939
+ ; GFX942-NEXT: .LBB5_4: ; %atomicrmw.private
886
940
; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0
887
941
; GFX942-NEXT: s_waitcnt vmcnt(0)
888
- ; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
889
- ; GFX942-NEXT: scratch_store_dwordx2 off, v[0:1], s0
890
- ; GFX942-NEXT: s_branch .LBB5_6
891
- ; GFX942-NEXT: .LBB5_2: ; %atomicrmw.global
942
+ ; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
943
+ ; GFX942-NEXT: scratch_store_dwordx2 off, v[4:5], s0
944
+ ; GFX942-NEXT: s_branch .LBB5_9
945
+ ; GFX942-NEXT: .LBB5_5: ; %atomicrmw.global
946
+ ; GFX942-NEXT: v_mov_b32_e32 v2, 0
892
947
; GFX942-NEXT: s_getpc_b64 s[0:1]
893
948
; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4
894
949
; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12
895
- ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
896
- ; GFX942-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
950
+ ; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[0:1]
897
951
; GFX942-NEXT: s_mov_b64 s[0:1], 0
898
- ; GFX942-NEXT: s_branch .LBB5_4
899
- ; GFX942-NEXT: .LBB5_3: ; %Flow
952
+ ; GFX942-NEXT: s_branch .LBB5_7
953
+ ; GFX942-NEXT: .LBB5_6: ; %Flow
900
954
; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
901
- ; GFX942-NEXT: s_cbranch_vccnz .LBB5_1
902
- ; GFX942-NEXT: s_branch .LBB5_6
903
- ; GFX942-NEXT: .LBB5_4: ; %atomicrmw.start
955
+ ; GFX942-NEXT: s_cbranch_vccnz .LBB5_4
956
+ ; GFX942-NEXT: s_branch .LBB5_9
957
+ ; GFX942-NEXT: .LBB5_7: ; %atomicrmw.start
904
958
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
905
- ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
959
+ ; GFX942-NEXT: s_waitcnt vmcnt(0)
906
960
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
907
961
; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
908
962
; GFX942-NEXT: s_getpc_b64 s[2:3]
909
963
; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
910
964
; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
911
- ; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
912
- ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] sc0 sc1
913
- ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
965
+ ; GFX942-NEXT: v_mov_b32_e32 v6, 0
966
+ ; GFX942-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[2:3] sc0 sc1
967
+ ; GFX942-NEXT: s_waitcnt vmcnt(0)
914
968
; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5]
915
969
; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
916
970
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
917
- ; GFX942-NEXT: s_cbranch_execnz .LBB5_4
918
- ; GFX942-NEXT: ; %bb.5: ; %atomicrmw.end1
971
+ ; GFX942-NEXT: s_cbranch_execnz .LBB5_7
972
+ ; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end1
919
973
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
920
974
; GFX942-NEXT: s_mov_b64 s[0:1], 0
921
- ; GFX942-NEXT: s_branch .LBB5_3
922
- ; GFX942-NEXT: .LBB5_6: ; %atomicrmw.phi
923
- ; GFX942-NEXT: ; %bb.7: ; %atomicrmw.end
975
+ ; GFX942-NEXT: s_branch .LBB5_6
976
+ ; GFX942-NEXT: .LBB5_9: ; %Flow3
977
+ ; GFX942-NEXT: s_mov_b64 s[0:1], 0
978
+ ; GFX942-NEXT: s_branch .LBB5_1
979
+ ; GFX942-NEXT: .LBB5_10: ; %atomicrmw.phi
980
+ ; GFX942-NEXT: ; %bb.11: ; %atomicrmw.end
924
981
; GFX942-NEXT: s_mov_b32 s0, 32
982
+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
925
983
; GFX942-NEXT: v_lshrrev_b64 v[4:5], s0, v[2:3]
926
984
; GFX942-NEXT: v_mov_b32_e32 v0, v2
927
985
; GFX942-NEXT: v_mov_b32_e32 v1, v4
0 commit comments