Skip to content

Commit 34b6285

Browse files
authored
[AMDGPU] Treat image_msaa_load as a sampler operation (#141726)
While image_msaa_load does not take a sampler, it can behave as if it does on some hardware. This has implications for wait counting and clausing.
1 parent 6d88343 commit 34b6285

File tree

5 files changed

+30
-13
lines changed

5 files changed

+30
-13
lines changed

llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ class SIInsertHardClauses {
116116
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
117117
if (BaseInfo->BVH)
118118
return HARDCLAUSE_BVH;
119-
if (BaseInfo->Sampler)
119+
if (BaseInfo->Sampler || BaseInfo->MSAA)
120120
return HARDCLAUSE_MIMG_SAMPLE;
121121
return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC
122122
: HARDCLAUSE_MIMG_LOAD

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -189,9 +189,9 @@ VmemType getVmemType(const MachineInstr &Inst) {
189189
// We have to make an additional check for isVSAMPLE here since some
190190
// instructions don't have a sampler, but are still classified as sampler
191191
// instructions for the purposes of e.g. waitcnt.
192-
return BaseInfo->BVH ? VMEM_BVH
193-
: (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
194-
: VMEM_NOSAMPLER;
192+
bool HasSampler =
193+
BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst);
194+
return BaseInfo->BVH ? VMEM_BVH : HasSampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
195195
}
196196

197197
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,6 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg
9090
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
9191
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
9292
; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
93-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
9493
; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
9594
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
9695
; GFX11-TRUE16-NEXT: ; return to shader part epilog
@@ -100,7 +99,6 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg
10099
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
101100
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
102101
; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
103-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
104102
; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
105103
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
106104
; GFX11-FAKE16-NEXT: ; return to shader part epilog
@@ -166,7 +164,6 @@ define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg
166164
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
167165
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
168166
; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
169-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
170167
; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
171168
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
172169
; GFX11-TRUE16-NEXT: ; return to shader part epilog
@@ -176,7 +173,6 @@ define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg
176173
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
177174
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
178175
; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
179-
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
180176
; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
181177
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
182178
; GFX11-FAKE16-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -57,9 +57,9 @@ body: |
5757
; GFX11: S_WAITCNT 1015
5858
; GFX1150-NEXT: S_WAITCNT 1015
5959
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
60-
; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
60+
; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
6161
$vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
62-
renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
62+
renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
6363
S_ENDPGM 0
6464
...
6565
---
@@ -69,12 +69,12 @@ machineFunctionInfo:
6969
body: |
7070
bb.0:
7171
; GCN-LABEL: name: waitcnt-load-sample
72-
; GCN: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
72+
; GCN: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
7373
; GFX11: S_WAITCNT 1015
7474
; GFX1150-NEXT: S_WAITCNT 1015
7575
; GFX12-NEXT: S_WAIT_LOADCNT 0
7676
; GCN-NEXT: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
77-
renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
77+
renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
7878
$vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
7979
S_ENDPGM 0
8080
...

llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir

Lines changed: 22 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@ body: |
1313
; GFX11-NEXT: {{ $}}
1414
; GFX11-NEXT: S_WAITCNT 0
1515
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V1_gfx11 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
16-
; GFX11-NEXT: S_WAITCNT 1015
1716
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
1817
; GFX11-NEXT: S_WAITCNT 1015
1918
; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
@@ -22,3 +21,25 @@ body: |
2221
SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
2322
2423
...
24+
25+
---
26+
name: atomic_msaa
27+
tracksRegLiveness: true
28+
body: |
29+
bb.0:
30+
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
31+
32+
; GFX11-LABEL: name: atomic_msaa
33+
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
34+
; GFX11-NEXT: {{ $}}
35+
; GFX11-NEXT: S_WAITCNT 0
36+
; GFX11-NEXT: dead $vgpr0 = IMAGE_ATOMIC_ADD_V1_V2_gfx11 killed $vgpr0, renamable $vgpr4_vgpr5, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 6, -1, 1, 0, -1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
37+
; GFX11-NEXT: S_WAITCNT 1015
38+
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
39+
; GFX11-NEXT: S_WAITCNT 1015
40+
; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
41+
dead $vgpr0 = IMAGE_ATOMIC_ADD_V1_V2_gfx11 killed $vgpr0, renamable $vgpr4_vgpr5, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 6, -1, 1, 0, -1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
42+
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
43+
SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
44+
45+
...

0 commit comments

Comments
 (0)