Skip to content

Commit ae96216

Browse files
AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize
Add rules for G_AMDGPU_BUFFER_LOAD and implement waterfall lowering for divergent operands that must be sgpr.
1 parent 64d7853 commit ae96216

18 files changed

+513
-242
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) {
117117
return LLT::scalar(32);
118118
}
119119

120-
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
121-
const RegisterBankInfo &RBI);
120+
using ReadLaneFnTy =
121+
function_ref<MachineInstrBuilder(MachineIRBuilder &, Register, Register)>;
122+
123+
static Register buildReadLane(MachineIRBuilder &, Register,
124+
const RegisterBankInfo &, ReadLaneFnTy);
122125

123126
static void unmergeReadAnyLane(MachineIRBuilder &B,
124127
SmallVectorImpl<Register> &SgprDstParts,
125128
LLT UnmergeTy, Register VgprSrc,
126-
const RegisterBankInfo &RBI) {
129+
const RegisterBankInfo &RBI,
130+
ReadLaneFnTy BuildRL) {
127131
const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
128132
auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
129133
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
130-
SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
134+
SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL));
131135
}
132136
}
133137

134-
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
135-
const RegisterBankInfo &RBI) {
138+
static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc,
139+
const RegisterBankInfo &RBI,
140+
ReadLaneFnTy BuildRL) {
136141
LLT Ty = B.getMRI()->getType(VgprSrc);
137142
const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
138143
if (Ty.getSizeInBits() == 32) {
139-
return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
140-
.getReg(0);
144+
Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty});
145+
return BuildRL(B, SgprDst, VgprSrc).getReg(0);
141146
}
142147

143148
SmallVector<Register, 8> SgprDstParts;
144-
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
149+
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
150+
BuildRL);
145151

146152
return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
147153
}
148154

149-
void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
150-
Register VgprSrc, const RegisterBankInfo &RBI) {
155+
static void buildReadLane(MachineIRBuilder &B, Register SgprDst,
156+
Register VgprSrc, const RegisterBankInfo &RBI,
157+
ReadLaneFnTy BuildReadLane) {
151158
LLT Ty = B.getMRI()->getType(VgprSrc);
152159
if (Ty.getSizeInBits() == 32) {
153-
B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
160+
BuildReadLane(B, SgprDst, VgprSrc);
154161
return;
155162
}
156163

157164
SmallVector<Register, 8> SgprDstParts;
158-
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
165+
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
166+
BuildReadLane);
159167

160168
B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
161169
}
170+
171+
void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
172+
Register VgprSrc, const RegisterBankInfo &RBI) {
173+
return buildReadLane(
174+
B, SgprDst, VgprSrc, RBI,
175+
[](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
176+
return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
177+
});
178+
}
179+
180+
void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
181+
Register VgprSrc, const RegisterBankInfo &RBI) {
182+
return buildReadLane(
183+
B, SgprDst, VgprSrc, RBI,
184+
[](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
185+
return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
186+
.addReg(VgprSrc);
187+
});
188+
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ class IntrinsicLaneMaskAnalyzer {
5151

5252
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
5353
const RegisterBankInfo &RBI);
54+
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
55+
const RegisterBankInfo &RBI);
5456
}
5557
}
5658

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 236 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper(
3434
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
3535
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
3636
: ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
37-
MUI(MUI), RBI(RBI), RBLRules(RBLRules),
37+
MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
3838
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
3939
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
4040
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
@@ -57,6 +57,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
5757
lower(MI, Mapping, WaterfallSgprs);
5858
}
5959

60+
bool RegBankLegalizeHelper::executeInWaterfallLoop(
61+
MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
62+
SmallSet<Register, 4> &SGPROperandRegs) {
63+
// Track use registers which have already been expanded with a readfirstlane
64+
// sequence. This may have multiple uses if moving a sequence.
65+
DenseMap<Register, Register> WaterfalledRegMap;
66+
67+
MachineBasicBlock &MBB = B.getMBB();
68+
MachineFunction &MF = B.getMF();
69+
70+
const SIRegisterInfo *TRI = ST.getRegisterInfo();
71+
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
72+
unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
73+
if (IsWave32) {
74+
MovExecOpc = AMDGPU::S_MOV_B32;
75+
MovExecTermOpc = AMDGPU::S_MOV_B32_term;
76+
XorTermOpc = AMDGPU::S_XOR_B32_term;
77+
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
78+
ExecReg = AMDGPU::EXEC_LO;
79+
} else {
80+
MovExecOpc = AMDGPU::S_MOV_B64;
81+
MovExecTermOpc = AMDGPU::S_MOV_B64_term;
82+
XorTermOpc = AMDGPU::S_XOR_B64_term;
83+
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
84+
ExecReg = AMDGPU::EXEC;
85+
}
86+
87+
#ifndef NDEBUG
88+
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
89+
#endif
90+
91+
MachineRegisterInfo &MRI = *B.getMRI();
92+
Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
93+
Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
94+
95+
// Don't bother using generic instructions/registers for the exec mask.
96+
B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
97+
98+
Register SavedExec = MRI.createVirtualRegister(WaveRC);
99+
100+
// To insert the loop we need to split the block. Move everything before
101+
// this point to a new block, and insert a new empty block before this
102+
// instruction.
103+
MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
104+
MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
105+
MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
106+
MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
107+
MachineFunction::iterator MBBI(MBB);
108+
++MBBI;
109+
MF.insert(MBBI, LoopBB);
110+
MF.insert(MBBI, BodyBB);
111+
MF.insert(MBBI, RestoreExecBB);
112+
MF.insert(MBBI, RemainderBB);
113+
114+
LoopBB->addSuccessor(BodyBB);
115+
BodyBB->addSuccessor(RestoreExecBB);
116+
BodyBB->addSuccessor(LoopBB);
117+
118+
// Move the rest of the block into a new block.
119+
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
120+
RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
121+
122+
MBB.addSuccessor(LoopBB);
123+
RestoreExecBB->addSuccessor(RemainderBB);
124+
125+
B.setInsertPt(*LoopBB, LoopBB->end());
126+
127+
// +-MBB:------------+
128+
// | ... |
129+
// | %0 = G_INST_1 |
130+
// | %Dst = MI %Vgpr |
131+
// | %1 = G_INST_2 |
132+
// | ... |
133+
// +-----------------+
134+
// ->
135+
// +-MBB-------------------------------+
136+
// | ... |
137+
// | %0 = G_INST_1 |
138+
// | %SaveExecReg = S_MOV_B32 $exec_lo |
139+
// +----------------|------------------+
140+
// | /------------------------------|
141+
// V V |
142+
// +-LoopBB---------------------------------------------------------------+ |
143+
// | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
144+
// | instead of executing for each lane, see if other lanes had | |
145+
// | same value for %Vgpr and execute for them also. | |
146+
// | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
147+
// | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
148+
// | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
149+
// | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
150+
// +----------------|-----------------------------------------------------+ |
151+
// V |
152+
// +-BodyBB------------------------------------------------------------+ |
153+
// | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
154+
// | executed only for active lanes and written to Dst | |
155+
// | $exec = S_XOR_B32 $exec, %SavedExec | |
156+
// | set active lanes to 0 in SavedExec, lanes that did not write to | |
157+
// | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
158+
// | SI_WATERFALL_LOOP LoopBB |-----|
159+
// +----------------|--------------------------------------------------+
160+
// V
161+
// +-RestoreExecBB--------------------------+
162+
// | $exec_lo = S_MOV_B32_term %SaveExecReg |
163+
// +----------------|-----------------------+
164+
// V
165+
// +-RemainderBB:----------------------+
166+
// | %1 = G_INST_2 |
167+
// | ... |
168+
// +---------------------------------- +
169+
170+
// Move the instruction into the loop body. Note we moved everything after
171+
// Range.end() already into a new block, so Range.end() is no longer valid.
172+
BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
173+
174+
// Figure out the iterator range after splicing the instructions.
175+
MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
176+
auto NewEnd = BodyBB->end();
177+
assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
178+
179+
B.setMBB(*LoopBB);
180+
Register CondReg;
181+
182+
for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
183+
for (MachineOperand &Op : MI.all_uses()) {
184+
Register OldReg = Op.getReg();
185+
if (!SGPROperandRegs.count(OldReg))
186+
continue;
187+
188+
// See if we already processed this register in another instruction in
189+
// the sequence.
190+
auto OldVal = WaterfalledRegMap.find(OldReg);
191+
if (OldVal != WaterfalledRegMap.end()) {
192+
Op.setReg(OldVal->second);
193+
continue;
194+
}
195+
196+
Register OpReg = Op.getReg();
197+
LLT OpTy = MRI.getType(OpReg);
198+
199+
// TODO: support for agpr
200+
assert(MRI.getRegBank(OpReg) == VgprRB);
201+
Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
202+
buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
203+
204+
// Build the comparison(s), CurrentLaneReg == OpReg.
205+
unsigned OpSize = OpTy.getSizeInBits();
206+
unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
207+
LLT PartTy = LLT::scalar(PartSize);
208+
unsigned NumParts = OpSize / PartSize;
209+
SmallVector<Register, 8> OpParts;
210+
SmallVector<Register, 8> CurrentLaneParts;
211+
212+
if (NumParts == 1) {
213+
OpParts.push_back(OpReg);
214+
CurrentLaneParts.push_back(CurrentLaneReg);
215+
} else {
216+
auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
217+
auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
218+
for (unsigned i = 0; i < NumParts; ++i) {
219+
OpParts.push_back(UnmergeOp.getReg(i));
220+
CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
221+
}
222+
}
223+
224+
for (unsigned i = 0; i < NumParts; ++i) {
225+
Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
226+
B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
227+
228+
if (!CondReg)
229+
CondReg = CmpReg;
230+
else
231+
CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
232+
}
233+
234+
Op.setReg(CurrentLaneReg);
235+
236+
// Make sure we don't re-process this register again.
237+
WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
238+
}
239+
}
240+
241+
// Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
242+
Register CondRegLM =
243+
MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
244+
B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
245+
246+
// Update EXEC, save the original EXEC value to SavedExec.
247+
B.buildInstr(AndSaveExecOpc)
248+
.addDef(SavedExec)
249+
.addReg(CondRegLM, RegState::Kill);
250+
MRI.setSimpleHint(SavedExec, CondRegLM);
251+
252+
B.setInsertPt(*BodyBB, BodyBB->end());
253+
254+
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
255+
B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
256+
257+
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
258+
// s_cbranch_scc0?
259+
260+
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
261+
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
262+
263+
// Save the EXEC mask before the loop.
264+
B.setInsertPt(MBB, MBB.end());
265+
B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
266+
267+
// Restore the EXEC mask after the loop.
268+
B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
269+
B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
270+
271+
// Set the insert point after the original instruction, so any new
272+
// instructions will be in the remainder.
273+
B.setInsertPt(*RemainderBB, RemainderBB->begin());
274+
275+
return true;
276+
}
277+
60278
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
61279
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
62280
MachineFunction &MF = B.getMF();
@@ -395,7 +613,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
395613

396614
switch (Mapping.LoweringMethod) {
397615
case DoNotLower:
398-
return;
616+
break;
399617
case VccExtToSel:
400618
return lowerVccExtToSel(MI);
401619
case UniExtToSel: {
@@ -531,7 +749,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
531749
}
532750
}
533751

534-
// TODO: executeInWaterfallLoop(... WaterfallSgprs)
752+
if (!WaterfallSgprs.empty()) {
753+
MachineBasicBlock::iterator I = MI.getIterator();
754+
executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
755+
}
535756
}
536757

537758
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -543,6 +764,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
543764
case Vgpr16:
544765
return LLT::scalar(16);
545766
case Sgpr32:
767+
case Sgpr32_W:
546768
case Sgpr32Trunc:
547769
case Sgpr32AExt:
548770
case Sgpr32AExtBoolInReg:
@@ -578,6 +800,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
578800
case VgprV2S32:
579801
return LLT::fixed_vector(2, 32);
580802
case SgprV4S32:
803+
case SgprV4S32_W:
581804
case VgprV4S32:
582805
case UniInVgprV4S32:
583806
return LLT::fixed_vector(4, 32);
@@ -645,6 +868,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
645868
return VccRB;
646869
case Sgpr16:
647870
case Sgpr32:
871+
case Sgpr32_W:
648872
case Sgpr64:
649873
case SgprP1:
650874
case SgprP3:
@@ -653,6 +877,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
653877
case SgprV2S16:
654878
case SgprV2S32:
655879
case SgprV4S32:
880+
case SgprV4S32_W:
656881
case SgprB32:
657882
case SgprB64:
658883
case SgprB96:
@@ -894,6 +1119,14 @@ void RegBankLegalizeHelper::applyMappingSrc(
8941119
}
8951120
break;
8961121
}
1122+
// sgpr waterfall, scalars and vectors
1123+
case Sgpr32_W:
1124+
case SgprV4S32_W: {
1125+
assert(Ty == getTyFromID(MethodIDs[i]));
1126+
if (RB != SgprRB)
1127+
SgprWaterfallOperandRegs.insert(Reg);
1128+
break;
1129+
}
8971130
// sgpr and vgpr scalars with extend
8981131
case Sgpr32AExt: {
8991132
// Note: this ext allows S1, and it is meant to be combined away.

0 commit comments

Comments
 (0)