Skip to content

Commit 6f92339

Browse files
authored
[LV] Compute register usage for interleaving on VPlan. (#126437)
Add a version of calculateRegisterUsage that estimates register usage for a VPlan. This mostly just ports the existing code, with some updates to figure out what recipes will generate vectors vs scalars. There are a number of changes in the computed register usages, but they should be more accurate w.r.t. the generated vector code. There are the following changes: * Scalar usage increases in most cases by 1, as we always create a scalar canonical IV, which is alive across the loop and is not considered by the legacy implementation * Output is ordered by insertion, now scalar registers are added first due to the canonical IV phi. * Using the VPlan, we now also more precisely know if an induction will be vectorized or scalarized. Depends on #126415 PR: #126437
1 parent 556fb4c commit 6f92339

File tree

15 files changed

+451
-460
lines changed

15 files changed

+451
-460
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 232 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -992,7 +992,8 @@ class LoopVectorizationCostModel {
992992
/// If interleave count has been specified by metadata it will be returned.
993993
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
994994
/// are the selected vectorization factor and the cost of the selected VF.
995-
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
995+
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
996+
InstructionCost LoopCost);
996997

997998
/// Memory access instruction may be vectorized in more than one way.
998999
/// Form of instruction after vectorization depends on cost.
@@ -4873,8 +4874,233 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48734874
}
48744875
}
48754876

4877+
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
4878+
/// by calculating the highest number of values that are live at a single
4879+
/// location as a rough estimate. Returns the register usage for each VF in \p
4880+
/// VFs.
4881+
static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4882+
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
4883+
const TargetTransformInfo &TTI,
4884+
const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4885+
// Each 'key' in the map opens a new interval. The values
4886+
// of the map are the index of the 'last seen' usage of the
4887+
// recipe that is the key.
4888+
using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
4889+
4890+
// Maps indices to recipes.
4891+
SmallVector<VPRecipeBase *, 64> Idx2Recipe;
4892+
// Marks the end of each interval.
4893+
IntervalMap EndPoint;
4894+
// Saves the set of recipes that are used in the loop.
4895+
SmallPtrSet<VPRecipeBase *, 8> Ends;
4896+
// Saves the list of values that are used in the loop but are defined outside
4897+
// the loop (not including non-recipe values such as arguments and
4898+
// constants).
4899+
SmallSetVector<VPValue *, 8> LoopInvariants;
4900+
LoopInvariants.insert(&Plan.getVectorTripCount());
4901+
4902+
// We scan the loop in a topological order and assign a number to
4903+
// each recipe. We use RPO to ensure that defs are met before their users. We
4904+
// assume that each recipe that has in-loop users starts an interval. We
4905+
// record every time that an in-loop value is used, so we have a list of the
4906+
// first and last occurrences of each recipe.
4907+
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
4908+
Plan.getVectorLoopRegion());
4909+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4910+
if (!VPBB->getParent())
4911+
break;
4912+
for (VPRecipeBase &R : *VPBB) {
4913+
Idx2Recipe.push_back(&R);
4914+
4915+
// Save the end location of each USE.
4916+
for (VPValue *U : R.operands()) {
4917+
auto *DefR = U->getDefiningRecipe();
4918+
4919+
// Ignore non-recipe values such as arguments, constants, etc.
4920+
// FIXME: Might need some motivation why these values are ignored. If
4921+
// for example an argument is used inside the loop it will increase the
4922+
// register pressure (so shouldn't we add it to LoopInvariants).
4923+
if (!DefR && (!U->getLiveInIRValue() ||
4924+
!isa<Instruction>(U->getLiveInIRValue())))
4925+
continue;
4926+
4927+
// If this recipe is outside the loop then record it and continue.
4928+
if (!DefR) {
4929+
LoopInvariants.insert(U);
4930+
continue;
4931+
}
4932+
4933+
// Overwrite previous end points.
4934+
EndPoint[DefR] = Idx2Recipe.size();
4935+
Ends.insert(DefR);
4936+
}
4937+
}
4938+
if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
4939+
// VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4940+
// exiting block, where their increment will get materialized eventually.
4941+
for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
4942+
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4943+
EndPoint[&R] = Idx2Recipe.size();
4944+
Ends.insert(&R);
4945+
}
4946+
}
4947+
}
4948+
}
4949+
4950+
// Saves the list of intervals that end with the index in 'key'.
4951+
using RecipeList = SmallVector<VPRecipeBase *, 2>;
4952+
SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
4953+
4954+
// Next, we transpose the EndPoints into a multi map that holds the list of
4955+
// intervals that *end* at a specific location.
4956+
for (auto &Interval : EndPoint)
4957+
TransposeEnds[Interval.second].push_back(Interval.first);
4958+
4959+
SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
4960+
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> RUs(VFs.size());
4961+
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
4962+
4963+
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4964+
4965+
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4966+
4967+
const auto &TTICapture = TTI;
4968+
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4969+
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
4970+
(VF.isScalable() &&
4971+
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
4972+
return 0;
4973+
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
4974+
};
4975+
4976+
// We scan the instructions linearly and record each time that a new interval
4977+
// starts, by placing it in a set. If we find this value in TransposeEnds then
4978+
// we remove it from the set. The max register usage is the maximum register
4979+
// usage of the recipes of the set.
4980+
for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
4981+
VPRecipeBase *R = Idx2Recipe[Idx];
4982+
4983+
// Remove all of the recipes that end at this location.
4984+
RecipeList &List = TransposeEnds[Idx];
4985+
for (VPRecipeBase *ToRemove : List)
4986+
OpenIntervals.erase(ToRemove);
4987+
4988+
// Ignore recipes that are never used within the loop and do not have side
4989+
// effects.
4990+
if (!Ends.count(R) && !R->mayHaveSideEffects())
4991+
continue;
4992+
4993+
// Skip recipes for ignored values.
4994+
// TODO: Should mark recipes for ephemeral values that cannot be removed
4995+
// explicitly in VPlan.
4996+
if (isa<VPSingleDefRecipe>(R) &&
4997+
ValuesToIgnore.contains(
4998+
cast<VPSingleDefRecipe>(R)->getUnderlyingValue()))
4999+
continue;
5000+
5001+
// For each VF find the maximum usage of registers.
5002+
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5003+
// Count the number of registers used, per register class, given all open
5004+
// intervals.
5005+
// Note that elements in this SmallMapVector will be default constructed
5006+
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5007+
// there is no previous entry for ClassID.
5008+
SmallMapVector<unsigned, unsigned, 4> RegUsage;
5009+
5010+
for (auto *R : OpenIntervals) {
5011+
// Skip recipes that weren't present in the original loop.
5012+
// TODO: Remove after removing the legacy
5013+
// LoopVectorizationCostModel::calculateRegisterUsage
5014+
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
5015+
VPBranchOnMaskRecipe>(R))
5016+
continue;
5017+
5018+
if (VFs[J].isScalar() ||
5019+
isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5020+
VPScalarIVStepsRecipe>(R) ||
5021+
(isa<VPInstruction>(R) &&
5022+
all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) {
5023+
return cast<VPRecipeBase>(U)->usesScalars(R->getVPSingleValue());
5024+
}))) {
5025+
unsigned ClassID = TTI.getRegisterClassForType(
5026+
false, TypeInfo.inferScalarType(R->getVPSingleValue()));
5027+
// FIXME: The target might use more than one register for the type
5028+
// even in the scalar case.
5029+
RegUsage[ClassID] += 1;
5030+
} else {
5031+
for (VPValue *DefV : R->definedValues()) {
5032+
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
5033+
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
5034+
RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
5035+
}
5036+
}
5037+
}
5038+
5039+
for (const auto &Pair : RegUsage) {
5040+
auto &Entry = MaxUsages[J][Pair.first];
5041+
Entry = std::max(Entry, Pair.second);
5042+
}
5043+
}
5044+
5045+
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5046+
<< OpenIntervals.size() << '\n');
5047+
5048+
// Add the current recipe to the list of open intervals.
5049+
OpenIntervals.insert(R);
5050+
}
5051+
5052+
// We also search for instructions that are defined outside the loop, but are
5053+
// used inside the loop. We need this number separately from the max-interval
5054+
// usage number because when we unroll, loop-invariant values do not take
5055+
// more registers.
5056+
LoopVectorizationCostModel::RegisterUsage RU;
5057+
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5058+
// Note that elements in this SmallMapVector will be default constructed
5059+
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5060+
// there is no previous entry for ClassID.
5061+
SmallMapVector<unsigned, unsigned, 4> Invariant;
5062+
5063+
for (auto *In : LoopInvariants) {
5064+
// FIXME: The target might use more than one register for the type
5065+
// even in the scalar case.
5066+
bool IsScalar = all_of(In->users(), [&](VPUser *U) {
5067+
return cast<VPRecipeBase>(U)->usesScalars(In);
5068+
});
5069+
5070+
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5071+
unsigned ClassID = TTI.getRegisterClassForType(
5072+
VF.isVector(), TypeInfo.inferScalarType(In));
5073+
Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
5074+
}
5075+
5076+
LLVM_DEBUG({
5077+
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5078+
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5079+
<< " item\n";
5080+
for (const auto &pair : MaxUsages[Idx]) {
5081+
dbgs() << "LV(REG): RegisterClass: "
5082+
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5083+
<< " registers\n";
5084+
}
5085+
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5086+
<< " item\n";
5087+
for (const auto &pair : Invariant) {
5088+
dbgs() << "LV(REG): RegisterClass: "
5089+
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5090+
<< " registers\n";
5091+
}
5092+
});
5093+
5094+
RU.LoopInvariantRegs = Invariant;
5095+
RU.MaxLocalUsers = MaxUsages[Idx];
5096+
RUs[Idx] = RU;
5097+
}
5098+
5099+
return RUs;
5100+
}
5101+
48765102
unsigned
4877-
LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5103+
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48785104
InstructionCost LoopCost) {
48795105
// -- The interleave heuristics --
48805106
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4924,7 +5150,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49245150
return 1;
49255151
}
49265152

4927-
RegisterUsage R = calculateRegisterUsage({VF})[0];
5153+
RegisterUsage R =
5154+
::calculateRegisterUsage(Plan, {VF}, TTI, ValuesToIgnore)[0];
49285155
// We divide by these constants so assume that we have at least one
49295156
// instruction that uses at least one register.
49305157
for (auto &Pair : R.MaxLocalUsers) {
@@ -5175,7 +5402,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
51755402
// We also search for instructions that are defined outside the loop, but are
51765403
// used inside the loop. We need this number separately from the max-interval
51775404
// usage number because when we unroll, loop-invariant values do not take
5178-
// more register.
5405+
// more registers.
51795406
LoopBlocksDFS DFS(TheLoop);
51805407
DFS.perform(LI);
51815408

@@ -10755,7 +10982,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1075510982
AddBranchWeights, CM.CostKind);
1075610983
if (LVP.hasPlanWithVF(VF.Width)) {
1075710984
// Select the interleave count.
10758-
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10985+
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
1075910986

1076010987
unsigned SelectedIC = std::max(IC, UserIC);
1076110988
// Optimistically generate runtime checks if they are needed. Drop them if

llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ target triple = "aarch64"
88
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
99
; CHECK: LV(REG): VF = 32
1010
; CHECK-NEXT: LV(REG): Found max usage: 2 item
11+
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
1112
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
12-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
1313

1414
define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
1515
entry:
@@ -31,8 +31,8 @@ loop:
3131
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
3232
; CHECK: LV(REG): VF = 64
3333
; CHECK-NEXT: LV(REG): Found max usage: 2 item
34+
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
3435
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
35-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
3636

3737
define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
3838
entry:

0 commit comments

Comments
 (0)