-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[LoopVectorizer] Prune VFs based on plan register pressure #132190
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9c95cba
956b905
6c751e7
3a203c4
eb3e94d
2f90fdf
7a5ffcf
c2e710e
4ccf89a
39f648f
d83c873
fea958d
a4395b5
084e513
e8abf0e
42268cd
bad1cf9
1a9d361
43294b9
637b0fc
bcd0608
1751e7c
a5cf131
fa7b725
ce446a8
71ac823
6d71404
9511885
c9b3d13
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -996,12 +996,15 @@ class LoopVectorizationCostModel { | |
/// Holds the maximum number of concurrent live intervals in the loop. | ||
/// The key is ClassID of target-provided register class. | ||
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; | ||
}; | ||
|
||
/// \return Returns information about the register usages of the loop for the | ||
/// given vectorization factors. | ||
SmallVector<RegisterUsage, 8> | ||
calculateRegisterUsage(ArrayRef<ElementCount> VFs); | ||
/// Check if any of the tracked live intervals exceeds the number of | ||
/// available registers for the target. | ||
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const { | ||
return any_of(MaxLocalUsers, [&TTI](auto &LU) { | ||
return LU.second > TTI.getNumberOfRegisters(LU.first); | ||
}); | ||
} | ||
}; | ||
|
||
/// Collect values we want to ignore in the cost model. | ||
void collectValuesToIgnore(); | ||
|
@@ -4013,29 +4016,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( | |
auto MaxVectorElementCountMaxBW = ElementCount::get( | ||
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), | ||
ComputeScalableMaxVF); | ||
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); | ||
|
||
// Collect all viable vectorization factors larger than the default MaxVF | ||
// (i.e. MaxVectorElementCount). | ||
SmallVector<ElementCount, 8> VFs; | ||
for (ElementCount VS = MaxVectorElementCount * 2; | ||
ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) | ||
VFs.push_back(VS); | ||
|
||
// For each VF calculate its register usage. | ||
auto RUs = calculateRegisterUsage(VFs); | ||
|
||
// Select the largest VF which doesn't require more registers than existing | ||
// ones. | ||
for (int I = RUs.size() - 1; I >= 0; --I) { | ||
const auto &MLU = RUs[I].MaxLocalUsers; | ||
if (all_of(MLU, [&](decltype(MLU.front()) &LU) { | ||
return LU.second <= TTI.getNumberOfRegisters(LU.first); | ||
})) { | ||
MaxVF = VFs[I]; | ||
break; | ||
} | ||
} | ||
MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); | ||
|
||
if (ElementCount MinVF = | ||
TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { | ||
if (ElementCount::isKnownLT(MaxVF, MinVF)) { | ||
|
@@ -4360,6 +4342,15 @@ static bool hasReplicatorRegion(VPlan &Plan) { | |
} | ||
|
||
#ifndef NDEBUG | ||
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs | ||
/// by calculating the highest number of values that are live at a single | ||
/// location as a rough estimate. Returns the register usage for each VF in \p | ||
/// VFs. | ||
static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> | ||
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs, | ||
const TargetTransformInfo &TTI, | ||
const SmallPtrSetImpl<const Value *> &ValuesToIgnore); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Checking: was this function already defined, but used in another context? If so, which one? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's used in and defined above |
||
|
||
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { | ||
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); | ||
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); | ||
|
@@ -4383,11 +4374,19 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { | |
} | ||
|
||
for (auto &P : VPlans) { | ||
for (ElementCount VF : P->vectorFactors()) { | ||
ArrayRef<ElementCount> VFs(P->vectorFactors().begin(), | ||
P->vectorFactors().end()); | ||
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore); | ||
for (auto [VF, RU] : zip_equal(VFs, RUs)) { | ||
// The cost for scalar VF=1 is already calculated, so ignore it. | ||
if (VF.isScalar()) | ||
continue; | ||
|
||
/// Don't consider the VF if it exceeds the number of registers for the | ||
/// target. | ||
if (RU.exceedsMaxNumRegs(TTI)) | ||
continue; | ||
|
||
InstructionCost C = CM.expectedCost(VF); | ||
|
||
// Add on other costs that are modelled in VPlan, but not in the legacy | ||
|
@@ -4859,9 +4858,13 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs, | |
isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe, | ||
VPScalarIVStepsRecipe>(R) || | ||
(isa<VPInstruction>(R) && | ||
all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) { | ||
return cast<VPRecipeBase>(U)->usesScalars(R->getVPSingleValue()); | ||
}))) { | ||
all_of(cast<VPSingleDefRecipe>(R)->users(), | ||
[&](VPUser *U) { | ||
return cast<VPRecipeBase>(U)->usesScalars( | ||
R->getVPSingleValue()); | ||
})) || | ||
(isa<VPReductionPHIRecipe>(R) && | ||
(cast<VPReductionPHIRecipe>(R))->isInLoop())) { | ||
unsigned ClassID = TTI.getRegisterClassForType( | ||
false, TypeInfo.inferScalarType(R->getVPSingleValue())); | ||
// FIXME: The target might use more than one register for the type | ||
|
@@ -5234,213 +5237,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, | |
return 1; | ||
} | ||
|
||
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> | ||
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { | ||
// This function calculates the register usage by measuring the highest number | ||
// of values that are alive at a single location. Obviously, this is a very | ||
// rough estimation. We scan the loop in a topological order and | ||
// assign a number to each instruction. We use RPO to ensure that defs are | ||
// met before their users. We assume that each instruction that has in-loop | ||
// users starts an interval. We record every time that an in-loop value is | ||
// used, so we have a list of the first and last occurrences of each | ||
// instruction. Next, we transpose this data structure into a multi map that | ||
// holds the list of intervals that *end* at a specific location. This multi | ||
// map allows us to perform a linear search. We scan the instructions linearly | ||
// and record each time that a new interval starts, by placing it in a set. | ||
// If we find this value in the multi-map then we remove it from the set. | ||
// The max register usage is the maximum size of the set. | ||
// We also search for instructions that are defined outside the loop, but are | ||
// used inside the loop. We need this number separately from the max-interval | ||
// usage number because when we unroll, loop-invariant values do not take | ||
// more registers. | ||
LoopBlocksDFS DFS(TheLoop); | ||
DFS.perform(LI); | ||
|
||
RegisterUsage RU; | ||
|
||
// Each 'key' in the map opens a new interval. The values | ||
// of the map are the index of the 'last seen' usage of the | ||
// instruction that is the key. | ||
using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>; | ||
|
||
// Maps instruction to its index. | ||
SmallVector<Instruction *, 64> IdxToInstr; | ||
// Marks the end of each interval. | ||
IntervalMap EndPoint; | ||
// Saves the list of instruction indices that are used in the loop. | ||
SmallPtrSet<Instruction *, 8> Ends; | ||
// Saves the list of values that are used in the loop but are defined outside | ||
// the loop (not including non-instruction values such as arguments and | ||
// constants). | ||
SmallSetVector<Instruction *, 8> LoopInvariants; | ||
|
||
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { | ||
for (Instruction &I : BB->instructionsWithoutDebug()) { | ||
IdxToInstr.push_back(&I); | ||
|
||
// Save the end location of each USE. | ||
for (Value *U : I.operands()) { | ||
auto *Instr = dyn_cast<Instruction>(U); | ||
|
||
// Ignore non-instruction values such as arguments, constants, etc. | ||
// FIXME: Might need some motivation why these values are ignored. If | ||
// for example an argument is used inside the loop it will increase the | ||
// register pressure (so shouldn't we add it to LoopInvariants). | ||
if (!Instr) | ||
continue; | ||
|
||
// If this instruction is outside the loop then record it and continue. | ||
if (!TheLoop->contains(Instr)) { | ||
LoopInvariants.insert(Instr); | ||
continue; | ||
} | ||
|
||
// Overwrite previous end points. | ||
EndPoint[Instr] = IdxToInstr.size(); | ||
Ends.insert(Instr); | ||
} | ||
} | ||
} | ||
|
||
// Saves the list of intervals that end with the index in 'key'. | ||
using InstrList = SmallVector<Instruction *, 2>; | ||
SmallDenseMap<unsigned, InstrList, 16> TransposeEnds; | ||
|
||
// Transpose the EndPoints to a list of values that end at each index. | ||
for (auto &Interval : EndPoint) | ||
TransposeEnds[Interval.second].push_back(Interval.first); | ||
|
||
SmallPtrSet<Instruction *, 8> OpenIntervals; | ||
SmallVector<RegisterUsage, 8> RUs(VFs.size()); | ||
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); | ||
|
||
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); | ||
|
||
const auto &TTICapture = TTI; | ||
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { | ||
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || | ||
(VF.isScalable() && | ||
!TTICapture.isElementTypeLegalForScalableVector(Ty))) | ||
return 0; | ||
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); | ||
}; | ||
|
||
collectInLoopReductions(); | ||
|
||
for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) { | ||
Instruction *I = IdxToInstr[Idx]; | ||
|
||
// Remove all of the instructions that end at this location. | ||
InstrList &List = TransposeEnds[Idx]; | ||
for (Instruction *ToRemove : List) | ||
OpenIntervals.erase(ToRemove); | ||
|
||
// Ignore instructions that are never used within the loop and do not have | ||
// side-effects. | ||
if (!Ends.count(I) && !I->mayHaveSideEffects()) | ||
continue; | ||
|
||
// Skip ignored values. | ||
if (ValuesToIgnore.count(I)) | ||
continue; | ||
|
||
// For each VF find the maximum usage of registers. | ||
for (unsigned J = 0, E = VFs.size(); J < E; ++J) { | ||
// Count the number of registers used, per register class, given all open | ||
// intervals. | ||
// Note that elements in this SmallMapVector will be default constructed | ||
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if | ||
// there is no previous entry for ClassID. | ||
SmallMapVector<unsigned, unsigned, 4> RegUsage; | ||
|
||
if (VFs[J].isScalar()) { | ||
for (auto *Inst : OpenIntervals) { | ||
unsigned ClassID = | ||
TTI.getRegisterClassForType(false, Inst->getType()); | ||
// FIXME: The target might use more than one register for the type | ||
// even in the scalar case. | ||
RegUsage[ClassID] += 1; | ||
} | ||
} else { | ||
collectNonVectorizedAndSetWideningDecisions(VFs[J]); | ||
for (auto *Inst : OpenIntervals) { | ||
// Skip ignored values for VF > 1. | ||
if (VecValuesToIgnore.count(Inst)) | ||
continue; | ||
if (isScalarAfterVectorization(Inst, VFs[J])) { | ||
unsigned ClassID = | ||
TTI.getRegisterClassForType(false, Inst->getType()); | ||
// FIXME: The target might use more than one register for the type | ||
// even in the scalar case. | ||
RegUsage[ClassID] += 1; | ||
} else { | ||
unsigned ClassID = | ||
TTI.getRegisterClassForType(true, Inst->getType()); | ||
RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]); | ||
} | ||
} | ||
} | ||
|
||
for (const auto &Pair : RegUsage) { | ||
auto &Entry = MaxUsages[J][Pair.first]; | ||
Entry = std::max(Entry, Pair.second); | ||
} | ||
} | ||
|
||
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " | ||
<< OpenIntervals.size() << '\n'); | ||
|
||
// Add the current instruction to the list of open intervals. | ||
OpenIntervals.insert(I); | ||
} | ||
|
||
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { | ||
// Note that elements in this SmallMapVector will be default constructed | ||
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if | ||
// there is no previous entry for ClassID. | ||
SmallMapVector<unsigned, unsigned, 4> Invariant; | ||
|
||
for (auto *Inst : LoopInvariants) { | ||
// FIXME: The target might use more than one register for the type | ||
// even in the scalar case. | ||
bool IsScalar = all_of(Inst->users(), [&](User *U) { | ||
auto *I = cast<Instruction>(U); | ||
return TheLoop != LI->getLoopFor(I->getParent()) || | ||
isScalarAfterVectorization(I, VFs[Idx]); | ||
}); | ||
|
||
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx]; | ||
unsigned ClassID = | ||
TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); | ||
Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); | ||
} | ||
|
||
LLVM_DEBUG({ | ||
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; | ||
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() | ||
<< " item\n"; | ||
for (const auto &pair : MaxUsages[Idx]) { | ||
dbgs() << "LV(REG): RegisterClass: " | ||
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second | ||
<< " registers\n"; | ||
} | ||
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() | ||
<< " item\n"; | ||
for (const auto &pair : Invariant) { | ||
dbgs() << "LV(REG): RegisterClass: " | ||
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second | ||
<< " registers\n"; | ||
} | ||
}); | ||
|
||
RU.LoopInvariantRegs = Invariant; | ||
RU.MaxLocalUsers = MaxUsages[Idx]; | ||
RUs[Idx] = RU; | ||
} | ||
|
||
return RUs; | ||
} | ||
|
||
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, | ||
ElementCount VF) { | ||
// TODO: Cost model for emulated masked load/store is completely | ||
|
@@ -7621,7 +7417,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { | |
} | ||
|
||
for (auto &P : VPlans) { | ||
for (ElementCount VF : P->vectorFactors()) { | ||
ArrayRef<ElementCount> VFs(P->vectorFactors().begin(), | ||
P->vectorFactors().end()); | ||
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore); | ||
for (auto [VF, RU] : zip_equal(VFs, RUs)) { | ||
if (VF.isScalar()) | ||
continue; | ||
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { | ||
|
@@ -7642,6 +7441,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { | |
|
||
InstructionCost Cost = cost(*P, VF); | ||
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); | ||
|
||
if (RU.exceedsMaxNumRegs(TTI)) { | ||
LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " | ||
<< VF << " because it uses too many registers\n"); | ||
continue; | ||
} | ||
|
||
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) | ||
BestFactor = CurrentFactor; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: move below
#ifndef NDEBUG
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.