@@ -574,13 +574,44 @@ entry:
574
574
define amdgpu_kernel void @add_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout ) {
575
575
; GFX6-LABEL: add_i32_varying_vdata:
576
576
; GFX6: ; %bb.0: ; %entry
577
- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
578
- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
577
+ ; GFX6-NEXT: s_mov_b64 s[0:1], exec
578
+ ; GFX6-NEXT: s_mov_b32 s4, 0
579
+ ; GFX6-NEXT: ; implicit-def: $vgpr1
580
+ ; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
581
+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
582
+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
583
+ ; GFX6-NEXT: s_mov_b32 m0, s5
584
+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
585
+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
586
+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
587
+ ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
588
+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
589
+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
590
+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
591
+ ; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
592
+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
593
+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
594
+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
595
+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
596
+ ; GFX6-NEXT: ; implicit-def: $vgpr0
597
+ ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
598
+ ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
599
+ ; GFX6-NEXT: s_cbranch_execz .LBB2_4
600
+ ; GFX6-NEXT: ; %bb.3:
601
+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
602
+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
579
603
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
580
- ; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
604
+ ; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
605
+ ; GFX6-NEXT: .LBB2_4:
606
+ ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
607
+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
581
608
; GFX6-NEXT: s_mov_b32 s3, 0xf000
582
609
; GFX6-NEXT: s_mov_b32 s2, -1
583
610
; GFX6-NEXT: s_waitcnt vmcnt(0)
611
+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
612
+ ; GFX6-NEXT: s_waitcnt expcnt(0)
613
+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
614
+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
584
615
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
585
616
; GFX6-NEXT: s_endpgm
586
617
;
@@ -937,15 +968,46 @@ entry:
937
968
define amdgpu_kernel void @struct_add_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout , i32 %vindex ) {
938
969
; GFX6-LABEL: struct_add_i32_varying_vdata:
939
970
; GFX6: ; %bb.0: ; %entry
940
- ; GFX6-NEXT: s_load_dword s8, s[2:3], 0x11
941
- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
942
- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
971
+ ; GFX6-NEXT: s_mov_b64 s[0:1], exec
972
+ ; GFX6-NEXT: s_mov_b32 s4, 0
973
+ ; GFX6-NEXT: ; implicit-def: $vgpr1
974
+ ; GFX6-NEXT: .LBB3_1: ; %ComputeLoop
975
+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
976
+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
977
+ ; GFX6-NEXT: s_mov_b32 m0, s5
978
+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
979
+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
980
+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
981
+ ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
982
+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
983
+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
984
+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
985
+ ; GFX6-NEXT: s_cbranch_vccnz .LBB3_1
986
+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
987
+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
988
+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
989
+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
990
+ ; GFX6-NEXT: ; implicit-def: $vgpr0
991
+ ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
992
+ ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
993
+ ; GFX6-NEXT: s_cbranch_execz .LBB3_4
994
+ ; GFX6-NEXT: ; %bb.3:
995
+ ; GFX6-NEXT: s_load_dword s5, s[2:3], 0x11
996
+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
997
+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
943
998
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
944
- ; GFX6-NEXT: v_mov_b32_e32 v1, s8
945
- ; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
999
+ ; GFX6-NEXT: v_mov_b32_e32 v2, s5
1000
+ ; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
1001
+ ; GFX6-NEXT: .LBB3_4:
1002
+ ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
1003
+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
946
1004
; GFX6-NEXT: s_mov_b32 s3, 0xf000
947
1005
; GFX6-NEXT: s_mov_b32 s2, -1
948
1006
; GFX6-NEXT: s_waitcnt vmcnt(0)
1007
+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
1008
+ ; GFX6-NEXT: s_waitcnt expcnt(0)
1009
+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
1010
+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
949
1011
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
950
1012
; GFX6-NEXT: s_endpgm
951
1013
;
@@ -2011,13 +2073,44 @@ entry:
2011
2073
define amdgpu_kernel void @sub_i32_varying_vdata (ptr addrspace (1 ) %out , ptr addrspace (8 ) %inout ) {
2012
2074
; GFX6-LABEL: sub_i32_varying_vdata:
2013
2075
; GFX6: ; %bb.0: ; %entry
2014
- ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
2015
- ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
2076
+ ; GFX6-NEXT: s_mov_b64 s[0:1], exec
2077
+ ; GFX6-NEXT: s_mov_b32 s4, 0
2078
+ ; GFX6-NEXT: ; implicit-def: $vgpr1
2079
+ ; GFX6-NEXT: .LBB7_1: ; %ComputeLoop
2080
+ ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
2081
+ ; GFX6-NEXT: s_ff1_i32_b64 s5, s[0:1]
2082
+ ; GFX6-NEXT: s_mov_b32 m0, s5
2083
+ ; GFX6-NEXT: v_readlane_b32 s8, v0, s5
2084
+ ; GFX6-NEXT: v_writelane_b32 v1, s4, m0
2085
+ ; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
2086
+ ; GFX6-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
2087
+ ; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0
2088
+ ; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
2089
+ ; GFX6-NEXT: s_add_i32 s4, s4, s8
2090
+ ; GFX6-NEXT: s_cbranch_vccnz .LBB7_1
2091
+ ; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
2092
+ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2093
+ ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2094
+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2095
+ ; GFX6-NEXT: ; implicit-def: $vgpr0
2096
+ ; GFX6-NEXT: s_and_saveexec_b64 s[0:1], vcc
2097
+ ; GFX6-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
2098
+ ; GFX6-NEXT: s_cbranch_execz .LBB7_4
2099
+ ; GFX6-NEXT: ; %bb.3:
2100
+ ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0xd
2101
+ ; GFX6-NEXT: v_mov_b32_e32 v0, s4
2016
2102
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2017
- ; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
2103
+ ; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
2104
+ ; GFX6-NEXT: .LBB7_4:
2105
+ ; GFX6-NEXT: s_or_b64 exec, exec, s[0:1]
2106
+ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
2018
2107
; GFX6-NEXT: s_mov_b32 s3, 0xf000
2019
2108
; GFX6-NEXT: s_mov_b32 s2, -1
2020
2109
; GFX6-NEXT: s_waitcnt vmcnt(0)
2110
+ ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
2111
+ ; GFX6-NEXT: s_waitcnt expcnt(0)
2112
+ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
2113
+ ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2021
2114
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
2022
2115
; GFX6-NEXT: s_endpgm
2023
2116
;
0 commit comments