AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize #142790

Open · wants to merge 1 commit into base: users/petar-avramovic/ral-combine
53 changes: 40 additions & 13 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -117,45 +117,72 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) {
return LLT::scalar(32);
}

static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
const RegisterBankInfo &RBI);
using ReadLaneFnTy =
function_ref<MachineInstrBuilder(MachineIRBuilder &, Register, Register)>;

static Register buildReadLane(MachineIRBuilder &, Register,
const RegisterBankInfo &, ReadLaneFnTy);

static void unmergeReadAnyLane(MachineIRBuilder &B,
SmallVectorImpl<Register> &SgprDstParts,
LLT UnmergeTy, Register VgprSrc,
const RegisterBankInfo &RBI) {
const RegisterBankInfo &RBI,
ReadLaneFnTy BuildRL) {
const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL));
}
}

static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
const RegisterBankInfo &RBI) {
static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc,
const RegisterBankInfo &RBI,
ReadLaneFnTy BuildRL) {
LLT Ty = B.getMRI()->getType(VgprSrc);
const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
if (Ty.getSizeInBits() == 32) {
return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
.getReg(0);
Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty});
return BuildRL(B, SgprDst, VgprSrc).getReg(0);
}

SmallVector<Register, 8> SgprDstParts;
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
BuildRL);

return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
}

void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
Register VgprSrc, const RegisterBankInfo &RBI) {
static void buildReadLane(MachineIRBuilder &B, Register SgprDst,
Register VgprSrc, const RegisterBankInfo &RBI,
ReadLaneFnTy BuildReadLane) {
LLT Ty = B.getMRI()->getType(VgprSrc);
if (Ty.getSizeInBits() == 32) {
B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
BuildReadLane(B, SgprDst, VgprSrc);
return;
}

SmallVector<Register, 8> SgprDstParts;
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
BuildReadLane);

B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
}

void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
Register VgprSrc, const RegisterBankInfo &RBI) {
return buildReadLane(
B, SgprDst, VgprSrc, RBI,
[](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
});
}

void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
Register VgprSrc, const RegisterBankInfo &RBI) {
return buildReadLane(
B, SgprDst, VgprSrc, RBI,
[](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
.addReg(VgprSrc);
});
}

[Inline review comment from a Contributor, attached to the B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, ...) line]: Not for this PR, but we should really have an opcode for this too instead of having one being an intrinsic and one being a generic opcode.
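
For illustration only, a minimal sketch of what that reviewer suggestion could look like, assuming a hypothetical generic G_AMDGPU_READFIRSTLANE opcode (which this PR does not add); buildReadFirstLane would then mirror buildReadAnyLane instead of emitting the amdgcn_readfirstlane intrinsic:

// Hypothetical sketch: assumes a generic G_AMDGPU_READFIRSTLANE opcode that
// does not exist as of this PR. With such an opcode, the lambda builds a
// generic instruction rather than the amdgcn_readfirstlane intrinsic.
void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
                                Register VgprSrc, const RegisterBankInfo &RBI) {
  return buildReadLane(
      B, SgprDst, VgprSrc, RBI,
      [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
        return B.buildInstr(AMDGPU::G_AMDGPU_READFIRSTLANE, {SgprDst},
                            {VgprSrc});
      });
}

Instruction selection could then lower both generic read-lane opcodes to V_READFIRSTLANE_B32, keeping the two helpers symmetric; this is only a sketch of the idea raised in the review comment, not part of the change under review.
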
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -51,6 +51,8 @@ class IntrinsicLaneMaskAnalyzer {

void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
}
}

239 changes: 236 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -34,7 +34,7 @@ RegBankLegalizeHelper::RegBankLegalizeHelper(
MachineIRBuilder &B, const MachineUniformityInfo &MUI,
const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
: ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
MUI(MUI), RBI(RBI), RBLRules(RBLRules),
MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
@@ -57,6 +57,224 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
lower(MI, Mapping, WaterfallSgprs);
}

bool RegBankLegalizeHelper::executeInWaterfallLoop(
MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
SmallSet<Register, 4> &SGPROperandRegs) {
// Track use registers which have already been expanded with a readfirstlane
// sequence. This may have multiple uses if moving a sequence.
DenseMap<Register, Register> WaterfalledRegMap;

MachineBasicBlock &MBB = B.getMBB();
MachineFunction &MF = B.getMF();

const SIRegisterInfo *TRI = ST.getRegisterInfo();
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
if (IsWave32) {
MovExecOpc = AMDGPU::S_MOV_B32;
MovExecTermOpc = AMDGPU::S_MOV_B32_term;
XorTermOpc = AMDGPU::S_XOR_B32_term;
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
ExecReg = AMDGPU::EXEC_LO;
} else {
MovExecOpc = AMDGPU::S_MOV_B64;
MovExecTermOpc = AMDGPU::S_MOV_B64_term;
XorTermOpc = AMDGPU::S_XOR_B64_term;
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
ExecReg = AMDGPU::EXEC;
}

#ifndef NDEBUG
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

MachineRegisterInfo &MRI = *B.getMRI();
Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

// Don't bother using generic instructions/registers for the exec mask.
B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

Register SavedExec = MRI.createVirtualRegister(WaveRC);

// To insert the loop we need to split the block. Move everything before
// this point to a new block, and insert a new empty block before this
// instruction.
MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;
MF.insert(MBBI, LoopBB);
MF.insert(MBBI, BodyBB);
MF.insert(MBBI, RestoreExecBB);
MF.insert(MBBI, RemainderBB);

LoopBB->addSuccessor(BodyBB);
BodyBB->addSuccessor(RestoreExecBB);
BodyBB->addSuccessor(LoopBB);

// Move the rest of the block into a new block.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

MBB.addSuccessor(LoopBB);
RestoreExecBB->addSuccessor(RemainderBB);

B.setInsertPt(*LoopBB, LoopBB->end());

// +-MBB:------------+
// | ... |
// | %0 = G_INST_1 |
// | %Dst = MI %Vgpr |
// | %1 = G_INST_2 |
// | ... |
// +-----------------+
// ->
// +-MBB-------------------------------+
// | ... |
// | %0 = G_INST_1 |
// | %SaveExecReg = S_MOV_B32 $exec_lo |
// +----------------|------------------+
// | /------------------------------|
// V V |
// +-LoopBB---------------------------------------------------------------+ |
// | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
// | instead of executing for each lane, see if other lanes had | |
// | same value for %Vgpr and execute for them also. | |
// | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
// | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
// | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
// | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
// +----------------|-----------------------------------------------------+ |
// V |
// +-BodyBB------------------------------------------------------------+ |
// | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
// | executed only for active lanes and written to Dst | |
// | $exec = S_XOR_B32 $exec, %SavedExec | |
// | set active lanes to 0 in SavedExec, lanes that did not write to | |
// | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
// | SI_WATERFALL_LOOP LoopBB |-----|
// +----------------|--------------------------------------------------+
// V
// +-RestoreExecBB--------------------------+
// | $exec_lo = S_MOV_B32_term %SaveExecReg |
// +----------------|-----------------------+
// V
// +-RemainderBB:----------------------+
// | %1 = G_INST_2 |
// | ... |
// +-----------------------------------+

// Move the instruction into the loop body. Note we moved everything after
// Range.end() already into a new block, so Range.end() is no longer valid.
BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

// Figure out the iterator range after splicing the instructions.
MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
auto NewEnd = BodyBB->end();
assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

B.setMBB(*LoopBB);
Register CondReg;

for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
for (MachineOperand &Op : MI.all_uses()) {
Register OldReg = Op.getReg();
if (!SGPROperandRegs.count(OldReg))
continue;

// See if we already processed this register in another instruction in
// the sequence.
auto OldVal = WaterfalledRegMap.find(OldReg);
if (OldVal != WaterfalledRegMap.end()) {
Op.setReg(OldVal->second);
continue;
}

Register OpReg = Op.getReg();
LLT OpTy = MRI.getType(OpReg);

// TODO: support for agpr
assert(MRI.getRegBank(OpReg) == VgprRB);
Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

// Build the comparison(s), CurrentLaneReg == OpReg.
unsigned OpSize = OpTy.getSizeInBits();
unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
LLT PartTy = LLT::scalar(PartSize);
unsigned NumParts = OpSize / PartSize;
SmallVector<Register, 8> OpParts;
SmallVector<Register, 8> CurrentLaneParts;

if (NumParts == 1) {
OpParts.push_back(OpReg);
CurrentLaneParts.push_back(CurrentLaneReg);
} else {
auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
for (unsigned i = 0; i < NumParts; ++i) {
OpParts.push_back(UnmergeOp.getReg(i));
CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
}
}

for (unsigned i = 0; i < NumParts; ++i) {
Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

if (!CondReg)
CondReg = CmpReg;
else
CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
}

Op.setReg(CurrentLaneReg);

// Make sure we don't re-process this register again.
WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
}
}

// Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
Register CondRegLM =
MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

// Update EXEC, save the original EXEC value to SavedExec.
B.buildInstr(AndSaveExecOpc)
.addDef(SavedExec)
.addReg(CondRegLM, RegState::Kill);
MRI.setSimpleHint(SavedExec, CondRegLM);

B.setInsertPt(*BodyBB, BodyBB->end());

// Update EXEC, switch all done bits to 0 and all todo bits to 1.
B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?

// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

// Save the EXEC mask before the loop.
B.setInsertPt(MBB, MBB.end());
B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

// Restore the EXEC mask after the loop.
B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

// Set the insert point after the original instruction, so any new
// instructions will be in the remainder.
B.setInsertPt(*RemainderBB, RemainderBB->begin());

return true;
}

void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
MachineFunction &MF = B.getMF();
@@ -395,7 +613,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,

switch (Mapping.LoweringMethod) {
case DoNotLower:
return;
break;
case VccExtToSel:
return lowerVccExtToSel(MI);
case UniExtToSel: {
@@ -531,7 +749,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
}

// TODO: executeInWaterfallLoop(... WaterfallSgprs)
if (!WaterfallSgprs.empty()) {
MachineBasicBlock::iterator I = MI.getIterator();
executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
}
}

LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -543,6 +764,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Vgpr16:
return LLT::scalar(16);
case Sgpr32:
case Sgpr32_W:
case Sgpr32Trunc:
case Sgpr32AExt:
case Sgpr32AExtBoolInReg:
@@ -578,6 +800,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case VgprV2S32:
return LLT::fixed_vector(2, 32);
case SgprV4S32:
case SgprV4S32_W:
case VgprV4S32:
case UniInVgprV4S32:
return LLT::fixed_vector(4, 32);
@@ -645,6 +868,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
return VccRB;
case Sgpr16:
case Sgpr32:
case Sgpr32_W:
case Sgpr64:
case SgprP1:
case SgprP3:
Expand All @@ -653,6 +877,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprV2S16:
case SgprV2S32:
case SgprV4S32:
case SgprV4S32_W:
case SgprB32:
case SgprB64:
case SgprB96:
@@ -894,6 +1119,14 @@ void RegBankLegalizeHelper::applyMappingSrc(
}
break;
}
// sgpr waterfall, scalars and vectors
case Sgpr32_W:
case SgprV4S32_W: {
assert(Ty == getTyFromID(MethodIDs[i]));
if (RB != SgprRB)
SgprWaterfallOperandRegs.insert(Reg);
break;
}
// sgpr and vgpr scalars with extend
case Sgpr32AExt: {
// Note: this ext allows S1, and it is meant to be combined away.