Skip to content

[LoopVectorizer] Prune VFs based on plan register pressure #132190

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
May 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
9c95cba
[LoopVectorizer] Prune VFs based on plan register pressure
SamTebbs33 Mar 11, 2025
956b905
Format
SamTebbs33 Mar 20, 2025
6c751e7
Ignore in-loop reductions
SamTebbs33 Mar 21, 2025
3a203c4
Simplify in-loop checking
SamTebbs33 Mar 21, 2025
eb3e94d
Re-add tripcount test
SamTebbs33 Mar 25, 2025
2f90fdf
Revert scalable-call.ll changes
SamTebbs33 Mar 26, 2025
7a5ffcf
Set MaxVF without loop if MaxVectorElementCount <= MaxVectorElementCo…
SamTebbs33 Mar 26, 2025
c2e710e
Move calculateRegisterUsage out of cost model
SamTebbs33 Mar 26, 2025
4ccf89a
Separate out scaled reduction changes
SamTebbs33 Mar 26, 2025
39f648f
Fix RISCV tests
SamTebbs33 Apr 3, 2025
d83c873
Use zip_equal
SamTebbs33 Apr 8, 2025
fea958d
Don't add VF to profitable list if uses too many registers
SamTebbs33 Apr 8, 2025
a4395b5
Format
SamTebbs33 Apr 10, 2025
084e513
Prune high register pressure VFs in legacy cost model
SamTebbs33 Apr 11, 2025
e8abf0e
Format
SamTebbs33 Apr 14, 2025
42268cd
Revert neon test changes
SamTebbs33 Apr 14, 2025
bad1cf9
Forward declare and move back calculate function
SamTebbs33 Apr 15, 2025
1a9d361
Use ArrayRef
SamTebbs33 Apr 15, 2025
43294b9
Use auto
SamTebbs33 Apr 15, 2025
637b0fc
Use ArrayRef in calculateRegisterUsage
SamTebbs33 Apr 15, 2025
bcd0608
Rebase and add RegisterUsage.anyGreaterThanNumberOfregisters
SamTebbs33 Apr 17, 2025
1751e7c
Add in-loop reduction comment and rename to exceedsMaxNumRegs
SamTebbs33 Apr 17, 2025
a5cf131
NFC: Only capture TTI in lambda
SamTebbs33 Apr 22, 2025
fa7b725
NFC: Assign directly to MaxVF
SamTebbs33 Apr 22, 2025
ce446a8
NFC: Move comment below ifndef NDEBUG
SamTebbs33 Apr 22, 2025
71ac823
Improve inloop reduction detection
SamTebbs33 Apr 22, 2025
6d71404
Change debug message and add const
SamTebbs33 Apr 23, 2025
9511885
Fix AVX512F check statements
SamTebbs33 May 13, 2025
c9b3d13
Rebase
SamTebbs33 May 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
286 changes: 46 additions & 240 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -996,12 +996,15 @@ class LoopVectorizationCostModel {
/// Holds the maximum number of concurrent live intervals in the loop.
/// The key is ClassID of target-provided register class.
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
};

/// \return Returns information about the register usages of the loop for the
/// given vectorization factors.
SmallVector<RegisterUsage, 8>
calculateRegisterUsage(ArrayRef<ElementCount> VFs);
/// Returns true when, for at least one register class recorded in
/// MaxLocalUsers, the maximum number of concurrently live intervals is larger
/// than the number of registers \p TTI reports for that class.
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const {
  // Walk every (register class, max live count) entry and stop at the first
  // class whose demand exceeds what the target provides.
  for (const auto &[ClassID, NumLive] : MaxLocalUsers)
    if (NumLive > TTI.getNumberOfRegisters(ClassID))
      return true;
  return false;
}
};

/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();
Expand Down Expand Up @@ -4013,29 +4016,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
auto MaxVectorElementCountMaxBW = ElementCount::get(
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorElementCount).
SmallVector<ElementCount, 8> VFs;
for (ElementCount VS = MaxVectorElementCount * 2;
ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
VFs.push_back(VS);

// For each VF calculate its register usage.
auto RUs = calculateRegisterUsage(VFs);

// Select the largest VF which doesn't require more registers than existing
// ones.
for (int I = RUs.size() - 1; I >= 0; --I) {
const auto &MLU = RUs[I].MaxLocalUsers;
if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
return LU.second <= TTI.getNumberOfRegisters(LU.first);
})) {
MaxVF = VFs[I];
break;
}
}
MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

if (ElementCount MinVF =
TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
if (ElementCount::isKnownLT(MaxVF, MinVF)) {
Expand Down Expand Up @@ -4360,6 +4342,15 @@ static bool hasReplicatorRegion(VPlan &Plan) {
}

#ifndef NDEBUG
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
/// by calculating the highest number of values that are live at a single
/// location as a rough estimate. Returns the register usage for each VF in \p
/// VFs.
Comment on lines +4345 to +4348
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: move below #ifndef NDEBUG

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value *> &ValuesToIgnore);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking: this function was already defined, but was it used in another context? If so, which one?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's used in, and defined above, selectInterleaveCount. I originally moved the function from there to above selectVectorizationFactor, but that produced a big diff that was harder to review.


VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
Expand All @@ -4383,11 +4374,19 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
}

for (auto &P : VPlans) {
for (ElementCount VF : P->vectorFactors()) {
ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
P->vectorFactors().end());
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore);
for (auto [VF, RU] : zip_equal(VFs, RUs)) {
// The cost for scalar VF=1 is already calculated, so ignore it.
if (VF.isScalar())
continue;

/// Don't consider the VF if it exceeds the number of registers for the
/// target.
if (RU.exceedsMaxNumRegs(TTI))
continue;

InstructionCost C = CM.expectedCost(VF);

// Add on other costs that are modelled in VPlan, but not in the legacy
Expand Down Expand Up @@ -4859,9 +4858,13 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
VPScalarIVStepsRecipe>(R) ||
(isa<VPInstruction>(R) &&
all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) {
return cast<VPRecipeBase>(U)->usesScalars(R->getVPSingleValue());
}))) {
all_of(cast<VPSingleDefRecipe>(R)->users(),
[&](VPUser *U) {
return cast<VPRecipeBase>(U)->usesScalars(
R->getVPSingleValue());
})) ||
(isa<VPReductionPHIRecipe>(R) &&
(cast<VPReductionPHIRecipe>(R))->isInLoop())) {
unsigned ClassID = TTI.getRegisterClassForType(
false, TypeInfo.inferScalarType(R->getVPSingleValue()));
// FIXME: The target might use more than one register for the type
Expand Down Expand Up @@ -5234,213 +5237,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
return 1;
}

/// Estimate the register usage of TheLoop for each vectorization factor in
/// \p VFs.
///
/// \param VFs the vectorization factors to evaluate.
/// \return one RegisterUsage per entry of \p VFs, in the same order. For each
/// VF, MaxLocalUsers holds the per-register-class maximum number of registers
/// needed by simultaneously open intervals, and LoopInvariantRegs holds the
/// per-register-class registers needed by instructions defined outside the
/// loop but used inside it.
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order and
// assign a number to each instruction. We use RPO to ensure that defs are
// met before their users. We assume that each instruction that has in-loop
// users starts an interval. We record every time that an in-loop value is
// used, so we have a list of the first and last occurrences of each
// instruction. Next, we transpose this data structure into a multi map that
// holds the list of intervals that *end* at a specific location. This multi
// map allows us to perform a linear search. We scan the instructions linearly
// and record each time that a new interval starts, by placing it in a set.
// If we find this value in the multi-map then we remove it from the set.
// The max register usage is the maximum size of the set.
// We also search for instructions that are defined outside the loop, but are
// used inside the loop. We need this number separately from the max-interval
// usage number because when we unroll, loop-invariant values do not take
// more registers.
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);

// Scratch value that is filled in and copied into the result once per VF at
// the end of the function.
RegisterUsage RU;

// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
// instruction that is the key.
using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;

// Maps an index to its instruction; instructions are appended in the RPO
// visiting order used below.
SmallVector<Instruction *, 64> IdxToInstr;
// Marks the end of each interval.
IntervalMap EndPoint;
// Saves the set of in-loop instructions that are used as an operand by some
// instruction in the loop.
SmallPtrSet<Instruction *, 8> Ends;
// Saves the list of values that are used in the loop but are defined outside
// the loop (not including non-instruction values such as arguments and
// constants).
SmallSetVector<Instruction *, 8> LoopInvariants;

for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
for (Instruction &I : BB->instructionsWithoutDebug()) {
IdxToInstr.push_back(&I);

// Save the end location of each USE.
for (Value *U : I.operands()) {
auto *Instr = dyn_cast<Instruction>(U);

// Ignore non-instruction values such as arguments, constants, etc.
// FIXME: Might need some motivation why these values are ignored. If
// for example an argument is used inside the loop it will increase the
// register pressure (so shouldn't we add it to LoopInvariants).
if (!Instr)
continue;

// If this instruction is outside the loop then record it and continue.
if (!TheLoop->contains(Instr)) {
LoopInvariants.insert(Instr);
continue;
}

// Overwrite previous end points. I has already been appended to
// IdxToInstr, so the recorded value is the index of I plus one.
EndPoint[Instr] = IdxToInstr.size();
Ends.insert(Instr);
}
}
}

// Saves the list of intervals that end with the index in 'key'.
using InstrList = SmallVector<Instruction *, 2>;
SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;

// Transpose the EndPoints to a list of values that end at each index.
for (auto &Interval : EndPoint)
TransposeEnds[Interval.second].push_back(Interval.first);

// Instructions whose interval is currently open during the linear scan.
SmallPtrSet<Instruction *, 8> OpenIntervals;
// Result vector and per-VF running maxima, both indexed like VFs.
SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

// Returns the number of registers the target reports for a vector of Ty with
// VF elements, or 0 for token types, types that are not valid vector element
// types, and element types not legal for scalable vectors when VF is scalable.
const auto &TTICapture = TTI;
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
(VF.isScalable() &&
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
return 0;
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
};

collectInLoopReductions();

for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
Instruction *I = IdxToInstr[Idx];

// Remove all of the instructions that end at this location.
InstrList &List = TransposeEnds[Idx];
for (Instruction *ToRemove : List)
OpenIntervals.erase(ToRemove);

// Ignore instructions that are never used within the loop and do not have
// side-effects.
if (!Ends.count(I) && !I->mayHaveSideEffects())
continue;

// Skip ignored values.
if (ValuesToIgnore.count(I))
continue;

// For each VF find the maximum usage of registers.
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
// Count the number of registers used, per register class, given all open
// intervals.
// Note that elements in this SmallMapVector will be default constructed
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
// there is no previous entry for ClassID.
SmallMapVector<unsigned, unsigned, 4> RegUsage;

if (VFs[J].isScalar()) {
for (auto *Inst : OpenIntervals) {
unsigned ClassID =
TTI.getRegisterClassForType(false, Inst->getType());
// FIXME: The target might use more than one register for the type
// even in the scalar case.
RegUsage[ClassID] += 1;
}
} else {
collectNonVectorizedAndSetWideningDecisions(VFs[J]);
for (auto *Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
if (VecValuesToIgnore.count(Inst))
continue;
if (isScalarAfterVectorization(Inst, VFs[J])) {
unsigned ClassID =
TTI.getRegisterClassForType(false, Inst->getType());
// FIXME: The target might use more than one register for the type
// even in the scalar case.
RegUsage[ClassID] += 1;
} else {
unsigned ClassID =
TTI.getRegisterClassForType(true, Inst->getType());
RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
}
}
}

// Fold the usage at this location into the running per-class maximum for
// this VF.
for (const auto &Pair : RegUsage) {
auto &Entry = MaxUsages[J][Pair.first];
Entry = std::max(Entry, Pair.second);
}
}

LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
<< OpenIntervals.size() << '\n');

// Add the current instruction to the list of open intervals.
OpenIntervals.insert(I);
}

for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
// Note that elements in this SmallMapVector will be default constructed
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
// there is no previous entry for ClassID.
SmallMapVector<unsigned, unsigned, 4> Invariant;

for (auto *Inst : LoopInvariants) {
// FIXME: The target might use more than one register for the type
// even in the scalar case.
// The invariant is costed as scalar (VF = 1) when every user either is not
// directly in TheLoop according to LoopInfo or is scalar after
// vectorization for this VF.
bool IsScalar = all_of(Inst->users(), [&](User *U) {
auto *I = cast<Instruction>(U);
return TheLoop != LI->getLoopFor(I->getParent()) ||
isScalarAfterVectorization(I, VFs[Idx]);
});

ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
unsigned ClassID =
TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
}

LLVM_DEBUG({
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
<< " item\n";
for (const auto &pair : MaxUsages[Idx]) {
dbgs() << "LV(REG): RegisterClass: "
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
<< " registers\n";
}
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
<< " item\n";
for (const auto &pair : Invariant) {
dbgs() << "LV(REG): RegisterClass: "
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
<< " registers\n";
}
});

// Publish the results for this VF at the same index as in VFs.
RU.LoopInvariantRegs = Invariant;
RU.MaxLocalUsers = MaxUsages[Idx];
RUs[Idx] = RU;
}

return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
ElementCount VF) {
// TODO: Cost model for emulated masked load/store is completely
Expand Down Expand Up @@ -7621,7 +7417,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
}

for (auto &P : VPlans) {
for (ElementCount VF : P->vectorFactors()) {
ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
P->vectorFactors().end());
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore);
for (auto [VF, RU] : zip_equal(VFs, RUs)) {
if (VF.isScalar())
continue;
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
Expand All @@ -7642,6 +7441,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {

InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

if (RU.exceedsMaxNumRegs(TTI)) {
LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
<< VF << " because it uses too many registers\n");
continue;
}

if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
BestFactor = CurrentFactor;

Expand Down
Loading