@@ -34,7 +34,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper(
34
34
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
35
35
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
36
36
: ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
37
- MUI(MUI), RBI(RBI), RBLRules(RBLRules),
37
+ MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
38
38
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
39
39
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
40
40
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
@@ -57,6 +57,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
57
57
lower (MI, Mapping, WaterfallSgprs);
58
58
}
59
59
60
+ bool RegBankLegalizeHelper::executeInWaterfallLoop (
61
+ MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
62
+ SmallSet<Register, 4 > &SGPROperandRegs) {
63
+ // Track use registers which have already been expanded with a readfirstlane
64
+ // sequence. This may have multiple uses if moving a sequence.
65
+ DenseMap<Register, Register> WaterfalledRegMap;
66
+
67
+ MachineBasicBlock &MBB = B.getMBB ();
68
+ MachineFunction &MF = B.getMF ();
69
+
70
+ const SIRegisterInfo *TRI = ST.getRegisterInfo ();
71
+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass ();
72
+ unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
73
+ if (IsWave32) {
74
+ MovExecOpc = AMDGPU::S_MOV_B32;
75
+ MovExecTermOpc = AMDGPU::S_MOV_B32_term;
76
+ XorTermOpc = AMDGPU::S_XOR_B32_term;
77
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
78
+ ExecReg = AMDGPU::EXEC_LO;
79
+ } else {
80
+ MovExecOpc = AMDGPU::S_MOV_B64;
81
+ MovExecTermOpc = AMDGPU::S_MOV_B64_term;
82
+ XorTermOpc = AMDGPU::S_XOR_B64_term;
83
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
84
+ ExecReg = AMDGPU::EXEC;
85
+ }
86
+
87
+ #ifndef NDEBUG
88
+ const int OrigRangeSize = std::distance (Range.begin (), Range.end ());
89
+ #endif
90
+
91
+ MachineRegisterInfo &MRI = *B.getMRI ();
92
+ Register SaveExecReg = MRI.createVirtualRegister (WaveRC);
93
+ Register InitSaveExecReg = MRI.createVirtualRegister (WaveRC);
94
+
95
+ // Don't bother using generic instructions/registers for the exec mask.
96
+ B.buildInstr (TargetOpcode::IMPLICIT_DEF).addDef (InitSaveExecReg);
97
+
98
+ Register SavedExec = MRI.createVirtualRegister (WaveRC);
99
+
100
+ // To insert the loop we need to split the block. Move everything before
101
+ // this point to a new block, and insert a new empty block before this
102
+ // instruction.
103
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock ();
104
+ MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock ();
105
+ MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock ();
106
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock ();
107
+ MachineFunction::iterator MBBI (MBB);
108
+ ++MBBI;
109
+ MF.insert (MBBI, LoopBB);
110
+ MF.insert (MBBI, BodyBB);
111
+ MF.insert (MBBI, RestoreExecBB);
112
+ MF.insert (MBBI, RemainderBB);
113
+
114
+ LoopBB->addSuccessor (BodyBB);
115
+ BodyBB->addSuccessor (RestoreExecBB);
116
+ BodyBB->addSuccessor (LoopBB);
117
+
118
+ // Move the rest of the block into a new block.
119
+ RemainderBB->transferSuccessorsAndUpdatePHIs (&MBB);
120
+ RemainderBB->splice (RemainderBB->begin (), &MBB, Range.end (), MBB.end ());
121
+
122
+ MBB.addSuccessor (LoopBB);
123
+ RestoreExecBB->addSuccessor (RemainderBB);
124
+
125
+ B.setInsertPt (*LoopBB, LoopBB->end ());
126
+
127
+ // +-MBB:------------+
128
+ // | ... |
129
+ // | %0 = G_INST_1 |
130
+ // | %Dst = MI %Vgpr |
131
+ // | %1 = G_INST_2 |
132
+ // | ... |
133
+ // +-----------------+
134
+ // ->
135
+ // +-MBB-------------------------------+
136
+ // | ... |
137
+ // | %0 = G_INST_1 |
138
+ // | %SaveExecReg = S_MOV_B32 $exec_lo |
139
+ // +----------------|------------------+
140
+ // | /------------------------------|
141
+ // V V |
142
+ // +-LoopBB---------------------------------------------------------------+ |
143
+ // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
144
+ // | instead of executing for each lane, see if other lanes had | |
145
+ // | same value for %Vgpr and execute for them also. | |
146
+ // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
147
+ // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
148
+ // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
149
+ // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
150
+ // +----------------|-----------------------------------------------------+ |
151
+ // V |
152
+ // +-BodyBB------------------------------------------------------------+ |
153
+ // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
154
+ // | executed only for active lanes and written to Dst | |
155
+ // | $exec = S_XOR_B32 $exec, %SavedExec | |
156
+ // | set active lanes to 0 in SavedExec, lanes that did not write to | |
157
+ // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
158
+ // | SI_WATERFALL_LOOP LoopBB |-----|
159
+ // +----------------|--------------------------------------------------+
160
+ // V
161
+ // +-RestoreExecBB--------------------------+
162
+ // | $exec_lo = S_MOV_B32_term %SaveExecReg |
163
+ // +----------------|-----------------------+
164
+ // V
165
+ // +-RemainderBB:----------------------+
166
+ // | %1 = G_INST_2 |
167
+ // | ... |
168
+ // +---------------------------------- +
169
+
170
+ // Move the instruction into the loop body. Note we moved everything after
171
+ // Range.end() already into a new block, so Range.end() is no longer valid.
172
+ BodyBB->splice (BodyBB->end (), &MBB, Range.begin (), MBB.end ());
173
+
174
+ // Figure out the iterator range after splicing the instructions.
175
+ MachineBasicBlock::iterator NewBegin = Range.begin ()->getIterator ();
176
+ auto NewEnd = BodyBB->end ();
177
+ assert (std::distance (NewBegin, NewEnd) == OrigRangeSize);
178
+
179
+ B.setMBB (*LoopBB);
180
+ Register CondReg;
181
+
182
+ for (MachineInstr &MI : make_range (NewBegin, NewEnd)) {
183
+ for (MachineOperand &Op : MI.all_uses ()) {
184
+ Register OldReg = Op.getReg ();
185
+ if (!SGPROperandRegs.count (OldReg))
186
+ continue ;
187
+
188
+ // See if we already processed this register in another instruction in
189
+ // the sequence.
190
+ auto OldVal = WaterfalledRegMap.find (OldReg);
191
+ if (OldVal != WaterfalledRegMap.end ()) {
192
+ Op.setReg (OldVal->second );
193
+ continue ;
194
+ }
195
+
196
+ Register OpReg = Op.getReg ();
197
+ LLT OpTy = MRI.getType (OpReg);
198
+
199
+ // TODO: support for agpr
200
+ assert (MRI.getRegBank (OpReg) == VgprRB);
201
+ Register CurrentLaneReg = MRI.createVirtualRegister ({SgprRB, OpTy});
202
+ buildReadFirstLane (B, CurrentLaneReg, OpReg, RBI);
203
+
204
+ // Build the comparison(s), CurrentLaneReg == OpReg.
205
+ unsigned OpSize = OpTy.getSizeInBits ();
206
+ unsigned PartSize = (OpSize % 64 == 0 ) ? 64 : 32 ;
207
+ LLT PartTy = LLT::scalar (PartSize);
208
+ unsigned NumParts = OpSize / PartSize;
209
+ SmallVector<Register, 8 > OpParts;
210
+ SmallVector<Register, 8 > CurrentLaneParts;
211
+
212
+ if (NumParts == 1 ) {
213
+ OpParts.push_back (OpReg);
214
+ CurrentLaneParts.push_back (CurrentLaneReg);
215
+ } else {
216
+ auto UnmergeOp = B.buildUnmerge ({VgprRB, PartTy}, OpReg);
217
+ auto UnmergeCurrLane = B.buildUnmerge ({SgprRB, PartTy}, CurrentLaneReg);
218
+ for (unsigned i = 0 ; i < NumParts; ++i) {
219
+ OpParts.push_back (UnmergeOp.getReg (i));
220
+ CurrentLaneParts.push_back (UnmergeCurrLane.getReg (i));
221
+ }
222
+ }
223
+
224
+ for (unsigned i = 0 ; i < NumParts; ++i) {
225
+ Register CmpReg = MRI.createVirtualRegister (VccRB_S1);
226
+ B.buildICmp (CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
227
+
228
+ if (!CondReg)
229
+ CondReg = CmpReg;
230
+ else
231
+ CondReg = B.buildAnd (VccRB_S1, CondReg, CmpReg).getReg (0 );
232
+ }
233
+
234
+ Op.setReg (CurrentLaneReg);
235
+
236
+ // Make sure we don't re-process this register again.
237
+ WaterfalledRegMap.insert (std::pair (OldReg, Op.getReg ()));
238
+ }
239
+ }
240
+
241
+ // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
242
+ Register CondRegLM =
243
+ MRI.createVirtualRegister ({WaveRC, LLT::scalar (IsWave32 ? 32 : 64 )});
244
+ B.buildIntrinsic (Intrinsic::amdgcn_ballot, CondRegLM).addReg (CondReg);
245
+
246
+ // Update EXEC, save the original EXEC value to SavedExec.
247
+ B.buildInstr (AndSaveExecOpc)
248
+ .addDef (SavedExec)
249
+ .addReg (CondRegLM, RegState::Kill);
250
+ MRI.setSimpleHint (SavedExec, CondRegLM);
251
+
252
+ B.setInsertPt (*BodyBB, BodyBB->end ());
253
+
254
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
255
+ B.buildInstr (XorTermOpc).addDef (ExecReg).addReg (ExecReg).addReg (SavedExec);
256
+
257
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
258
+ // s_cbranch_scc0?
259
+
260
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
261
+ B.buildInstr (AMDGPU::SI_WATERFALL_LOOP).addMBB (LoopBB);
262
+
263
+ // Save the EXEC mask before the loop.
264
+ B.setInsertPt (MBB, MBB.end ());
265
+ B.buildInstr (MovExecOpc).addDef (SaveExecReg).addReg (ExecReg);
266
+
267
+ // Restore the EXEC mask after the loop.
268
+ B.setInsertPt (*RestoreExecBB, RestoreExecBB->begin ());
269
+ B.buildInstr (MovExecTermOpc).addDef (ExecReg).addReg (SaveExecReg);
270
+
271
+ // Set the insert point after the original instruction, so any new
272
+ // instructions will be in the remainder.
273
+ B.setInsertPt (*RemainderBB, RemainderBB->begin ());
274
+
275
+ return true ;
276
+ }
277
+
60
278
void RegBankLegalizeHelper::splitLoad (MachineInstr &MI,
61
279
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
62
280
MachineFunction &MF = B.getMF ();
@@ -395,7 +613,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
395
613
396
614
switch (Mapping.LoweringMethod ) {
397
615
case DoNotLower:
398
- return ;
616
+ break ;
399
617
case VccExtToSel:
400
618
return lowerVccExtToSel (MI);
401
619
case UniExtToSel: {
@@ -531,7 +749,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
531
749
}
532
750
}
533
751
534
- // TODO: executeInWaterfallLoop(... WaterfallSgprs)
752
+ if (!WaterfallSgprs.empty ()) {
753
+ MachineBasicBlock::iterator I = MI.getIterator ();
754
+ executeInWaterfallLoop (B, make_range (I, std::next (I)), WaterfallSgprs);
755
+ }
535
756
}
536
757
537
758
LLT RegBankLegalizeHelper::getTyFromID (RegBankLLTMappingApplyID ID) {
@@ -543,6 +764,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
543
764
case Vgpr16:
544
765
return LLT::scalar (16 );
545
766
case Sgpr32:
767
+ case Sgpr32_W:
546
768
case Sgpr32Trunc:
547
769
case Sgpr32AExt:
548
770
case Sgpr32AExtBoolInReg:
@@ -578,6 +800,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
578
800
case VgprV2S32:
579
801
return LLT::fixed_vector (2 , 32 );
580
802
case SgprV4S32:
803
+ case SgprV4S32_W:
581
804
case VgprV4S32:
582
805
case UniInVgprV4S32:
583
806
return LLT::fixed_vector (4 , 32 );
@@ -645,6 +868,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
645
868
return VccRB;
646
869
case Sgpr16:
647
870
case Sgpr32:
871
+ case Sgpr32_W:
648
872
case Sgpr64:
649
873
case SgprP1:
650
874
case SgprP3:
@@ -653,6 +877,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
653
877
case SgprV2S16:
654
878
case SgprV2S32:
655
879
case SgprV4S32:
880
+ case SgprV4S32_W:
656
881
case SgprB32:
657
882
case SgprB64:
658
883
case SgprB96:
@@ -894,6 +1119,14 @@ void RegBankLegalizeHelper::applyMappingSrc(
894
1119
}
895
1120
break ;
896
1121
}
1122
+ // sgpr waterfall, scalars and vectors
1123
+ case Sgpr32_W:
1124
+ case SgprV4S32_W: {
1125
+ assert (Ty == getTyFromID (MethodIDs[i]));
1126
+ if (RB != SgprRB)
1127
+ SgprWaterfallOperandRegs.insert (Reg);
1128
+ break ;
1129
+ }
897
1130
// sgpr and vgpr scalars with extend
898
1131
case Sgpr32AExt: {
899
1132
// Note: this ext allows S1, and it is meant to be combined away.
0 commit comments