Skip to content

Commit 8ec4067

Browse files
authored
[VPlan] Implement unrolling as VPlan-to-VPlan transform. (#95842)
This patch implements explicit unrolling by UF as a VPlan transform. In follow-up patches this will allow simplifying VPTransformState (no need to store unrolled parts) as well as recipe execution (no need to generate code for multiple parts in each recipe). It also allows for more general optimizations (e.g. avoid generating code for recipes that are uniform across parts). It also unifies the logic dealing with unrolled parts in a single place, rather than spreading it out across multiple places (e.g. VPlan post-processing for header-phi recipes previously). In the initial implementation, a number of recipes still take the unrolled part as an additional, optional argument if their execution depends on the unrolled part. The computation for start/step values for scalable inductions changed slightly. Previously the step would be computed as a scalar and then splatted; now vscale gets splatted and multiplied by the step in a vector mul. This has been split off from #94339, which also includes changes to simplify VPTransformState and recipes' ::execute. The current version mostly leaves existing ::execute untouched and instead sets VPTransformState::UF to 1. A follow-up patch will clean up all references to VPTransformState::UF. Another follow-up patch will simplify VPTransformState to only store a single vector value per VPValue. PR: #95842
1 parent 6032fee commit 8ec4067

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1262
-562
lines changed

llvm/lib/Transforms/Vectorize/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ add_llvm_component_library(LLVMVectorize
1616
VPlanRecipes.cpp
1717
VPlanSLP.cpp
1818
VPlanTransforms.cpp
19+
VPlanUnroll.cpp
1920
VPlanVerifier.cpp
2021
VPlanUtils.cpp
2122

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,15 @@ class VPBuilder {
156156
DebugLoc DL, const Twine &Name = "") {
157157
return createInstruction(Opcode, Operands, DL, Name);
158158
}
159+
VPInstruction *createNaryOp(unsigned Opcode,
160+
std::initializer_list<VPValue *> Operands,
161+
std::optional<FastMathFlags> FMFs = {},
162+
DebugLoc DL = {}, const Twine &Name = "") {
163+
if (FMFs)
164+
return tryInsertInstruction(
165+
new VPInstruction(Opcode, Operands, *FMFs, DL, Name));
166+
return createInstruction(Opcode, Operands, DL, Name);
167+
}
159168

160169
VPInstruction *createOverflowingOp(unsigned Opcode,
161170
std::initializer_list<VPValue *> Operands,
@@ -164,6 +173,7 @@ class VPBuilder {
164173
return tryInsertInstruction(
165174
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
166175
}
176+
167177
VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
168178
const Twine &Name = "") {
169179
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
@@ -223,6 +233,11 @@ class VPBuilder {
223233
return tryInsertInstruction(new VPScalarCastRecipe(Opcode, Op, ResultTy));
224234
}
225235

236+
VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
237+
Type *ResultTy) {
238+
return tryInsertInstruction(new VPWidenCastRecipe(Opcode, Op, ResultTy));
239+
}
240+
226241
VPScalarIVStepsRecipe *
227242
createScalarIVSteps(Instruction::BinaryOps InductionOpcode,
228243
FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step) {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7507,6 +7507,10 @@ LoopVectorizationPlanner::executePlan(
75077507
"expanded SCEVs to reuse can only be used during epilogue vectorization");
75087508
(void)IsEpilogueVectorization;
75097509

7510+
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
7511+
// cost model is complete for better cost estimates.
7512+
VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7513+
OrigLoop->getHeader()->getModule()->getContext());
75107514
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
75117515

75127516
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
@@ -7625,7 +7629,7 @@ LoopVectorizationPlanner::executePlan(
76257629
if (MiddleTerm->isConditional() &&
76267630
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
76277631
// Assume that `Count % VectorTripCount` is equally distributed.
7628-
unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7632+
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
76297633
assert(TripCount > 0 && "trip count should not be zero");
76307634
const uint32_t Weights[] = {1, TripCount - 1};
76317635
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
391391
->shouldEmitDebugInfoForProfiling() &&
392392
!EnableFSDiscriminator) {
393393
// FIXME: For scalable vectors, assume vscale=1.
394+
unsigned UF = Plan->getUF();
394395
auto NewDIL =
395396
DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
396397
if (NewDIL)
@@ -1018,6 +1019,10 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
10181019
/// Assumes a single pre-header basic-block was created for this. Introduce
10191020
/// additional basic-blocks as needed, and fill them all.
10201021
void VPlan::execute(VPTransformState *State) {
1022+
// Set UF to 1, as the unrollByUF VPlan transform already explicitly unrolled
1023+
// the VPlan.
1024+
// TODO: Remove State::UF and all uses.
1025+
State->UF = 1;
10211026
// Initialize CFG state.
10221027
State->CFG.PrevVPBB = nullptr;
10231028
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
@@ -1093,6 +1098,10 @@ void VPlan::execute(VPTransformState *State) {
10931098
// consistent placement of all induction updates.
10941099
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
10951100
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
1101+
1102+
// Use the steps for the last part as backedge value for the induction.
1103+
if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
1104+
Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand(), 0));
10961105
continue;
10971106
}
10981107

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,7 @@ class VPBlockBase {
532532
VPBlocksTy &getSuccessors() { return Successors; }
533533

534534
iterator_range<VPBlockBase **> successors() { return Successors; }
535+
iterator_range<VPBlockBase **> predecessors() { return Predecessors; }
535536

536537
const VPBlocksTy &getPredecessors() const { return Predecessors; }
537538
VPBlocksTy &getPredecessors() { return Predecessors; }
@@ -724,6 +725,11 @@ class VPLiveOut : public VPUser {
724725

725726
PHINode *getPhi() const { return Phi; }
726727

728+
/// Live-outs are marked as only using the first part during the transition
729+
/// to unrolling directly on VPlan.
730+
/// TODO: Remove after unroller transition.
731+
bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }
732+
727733
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
728734
/// Print the VPLiveOut to \p O.
729735
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
@@ -1226,11 +1232,24 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
12261232
#endif
12271233
};
12281234

1235+
/// Helper to access the operand that contains the unroll part for this recipe
1236+
/// after unrolling.
1237+
template <unsigned PartOpIdx> class VPUnrollPartAccessor {
1238+
protected:
1239+
/// Return the VPValue operand containing the unroll part or null if there is
1240+
/// no such operand.
1241+
VPValue *getUnrollPartOperand(VPUser &U) const;
1242+
1243+
/// Return the unroll part.
1244+
unsigned getUnrollPart(VPUser &U) const;
1245+
};
1246+
12291247
/// This is a concrete Recipe that models a single VPlan-level instruction.
12301248
/// While as any Recipe it may generate a sequence of IR instructions when
12311249
/// executed, these instructions would always form a single-def expression as
12321250
/// the VPInstruction is also a single def-use vertex.
1233-
class VPInstruction : public VPRecipeWithIRFlags {
1251+
class VPInstruction : public VPRecipeWithIRFlags,
1252+
public VPUnrollPartAccessor<1> {
12341253
friend class VPlanSlp;
12351254

12361255
public:
@@ -1764,7 +1783,8 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
17641783
/// A recipe to compute the pointers for widened memory accesses of IndexTy for
17651784
/// all parts. If IsReverse is true, compute pointers for accessing the input in
17661785
/// reverse order per part.
1767-
class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
1786+
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
1787+
public VPUnrollPartAccessor<1> {
17681788
Type *IndexedTy;
17691789
bool IsReverse;
17701790

@@ -1789,7 +1809,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
17891809
bool onlyFirstPartUsed(const VPValue *Op) const override {
17901810
assert(is_contained(operands(), Op) &&
17911811
"Op must be an operand of the recipe");
1792-
assert(getNumOperands() == 1 && "must have a single operand");
1812+
assert(getNumOperands() <= 2 && "must have at most two operands");
17931813
return true;
17941814
}
17951815

@@ -1948,6 +1968,12 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
19481968
VPValue *getVFValue() { return getOperand(2); }
19491969
const VPValue *getVFValue() const { return getOperand(2); }
19501970

1971+
VPValue *getSplatVFValue() {
1972+
// If the recipe has been unrolled (5 operands), return the VPValue for the
1973+
// induction increment.
1974+
return getNumOperands() == 5 ? getOperand(3) : nullptr;
1975+
}
1976+
19511977
/// Returns the first defined value as TruncInst, if it is one or nullptr
19521978
/// otherwise.
19531979
TruncInst *getTruncInst() { return Trunc; }
@@ -1967,9 +1993,17 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
19671993
Type *getScalarType() const {
19681994
return Trunc ? Trunc->getType() : IV->getType();
19691995
}
1996+
1997+
/// Returns the VPValue representing the value of this induction at
1998+
/// the last unrolled part, if it exists. Returns itself if unrolling did not
1999+
/// take place.
2000+
VPValue *getLastUnrolledPartOperand() {
2001+
return getNumOperands() == 5 ? getOperand(4) : this;
2002+
}
19702003
};
19712004

1972-
class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
2005+
class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe,
2006+
public VPUnrollPartAccessor<3> {
19732007
const InductionDescriptor &IndDesc;
19742008

19752009
bool IsScalarAfterVectorization;
@@ -2006,6 +2040,13 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
20062040
/// Returns the induction descriptor for the recipe.
20072041
const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
20082042

2043+
/// Returns the VPValue representing the value of this induction at
2044+
/// the first unrolled part, if it exists. Returns itself if unrolling did not
2045+
/// take place.
2046+
VPValue *getFirstUnrolledPartOperand() {
2047+
return getUnrollPart(*this) == 0 ? this : getOperand(2);
2048+
}
2049+
20092050
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
20102051
/// Print the recipe.
20112052
void print(raw_ostream &O, const Twine &Indent,
@@ -2088,7 +2129,8 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
20882129
/// A recipe for handling reduction phis. The start value is the first operand
20892130
/// of the recipe and the incoming value from the backedge is the second
20902131
/// operand.
2091-
class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
2132+
class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
2133+
public VPUnrollPartAccessor<2> {
20922134
/// Descriptor for the reduction.
20932135
const RecurrenceDescriptor &RdxDesc;
20942136

@@ -2907,7 +2949,10 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
29072949
~VPActiveLaneMaskPHIRecipe() override = default;
29082950

29092951
VPActiveLaneMaskPHIRecipe *clone() override {
2910-
return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2952+
auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
2953+
if (getNumOperands() == 2)
2954+
R->addOperand(getOperand(1));
2955+
return R;
29112956
}
29122957

29132958
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
@@ -2966,7 +3011,8 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
29663011
};
29673012

29683013
/// A Recipe for widening the canonical induction variable of the vector loop.
2969-
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
3014+
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
3015+
public VPUnrollPartAccessor<1> {
29703016
public:
29713017
VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
29723018
: VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
@@ -3052,7 +3098,8 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
30523098

30533099
/// A recipe for handling phi nodes of integer and floating-point inductions,
30543100
/// producing their scalar values.
3055-
class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
3101+
class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
3102+
public VPUnrollPartAccessor<2> {
30563103
Instruction::BinaryOps InductionOpcode;
30573104

30583105
public:
@@ -3548,6 +3595,11 @@ class VPlan {
35483595

35493596
bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
35503597

3598+
unsigned getUF() const {
3599+
assert(UFs.size() == 1 && "Expected a single UF");
3600+
return UFs[0];
3601+
}
3602+
35513603
void setUF(unsigned UF) {
35523604
assert(hasUF(UF) && "Cannot set the UF not already in plan");
35533605
UFs.clear();
@@ -3732,6 +3784,22 @@ class VPBlockUtils {
37323784
connectBlocks(BlockPtr, NewBlock);
37333785
}
37343786

3787+
/// Insert disconnected block \p NewBlock before \p BlockPtr. First
3788+
/// disconnects all predecessors of \p BlockPtr and connects them to \p
3789+
/// NewBlock. Add \p NewBlock as predecessor of \p BlockPtr and \p BlockPtr as
3790+
/// successor of \p NewBlock.
3791+
static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
3792+
assert(NewBlock->getSuccessors().empty() &&
3793+
NewBlock->getPredecessors().empty() &&
3794+
"Can't insert new block with predecessors or successors.");
3795+
NewBlock->setParent(BlockPtr->getParent());
3796+
for (VPBlockBase *Pred : to_vector(BlockPtr->predecessors())) {
3797+
disconnectBlocks(Pred, BlockPtr);
3798+
connectBlocks(Pred, NewBlock);
3799+
}
3800+
connectBlocks(NewBlock, BlockPtr);
3801+
}
3802+
37353803
/// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
37363804
/// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
37373805
/// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ struct UnaryRecipe_match {
144144
return DefR && match(DefR);
145145
}
146146

147+
bool match(const VPSingleDefRecipe *R) {
148+
return match(static_cast<const VPRecipeBase *>(R));
149+
}
150+
147151
bool match(const VPRecipeBase *R) {
148152
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R))
149153
return false;

0 commit comments

Comments
 (0)