Skip to content

[LV] Reduce register usage for scaled reductions #133090

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Apr 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4874,6 +4874,16 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
}
}

/// Return the VF scaling factor that \p R applies to its output. Only
/// reduction phi recipes and partial reduction recipes carry such a factor;
/// every other recipe reports a factor of 1.
static unsigned getVFScaleFactor(VPRecipeBase *R) {
if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(R))
return PhiR->getVFScaleFactor();
auto *PartialR = dyn_cast<VPPartialReductionRecipe>(R);
return PartialR ? PartialR->getVFScaleFactor() : 1;
}

/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
/// by calculating the highest number of values that are live at a single
/// location as a rough estimate. Returns the register usage for each VF in \p
Expand Down Expand Up @@ -5028,10 +5038,19 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
// even in the scalar case.
RegUsage[ClassID] += 1;
} else {
// The output from scaled phis and scaled reductions actually has
// fewer lanes than the VF.
unsigned ScaleFactor = getVFScaleFactor(R);
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
LLVM_DEBUG(if (VF != VFs[J]) {
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: for the tests that check the debug output, can a check for this line be added?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately the VF that has partial reductions is pruned before the register usage is calculated. I can add one as part of #132190 once this is merged, though.

<< " for " << *R << "\n";
});

for (VPValue *DefV : R->definedValues()) {
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
}
}
}
Expand Down Expand Up @@ -9137,8 +9156,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);

if (getScalingForReduction(Instr))
return tryToCreatePartialReduction(Instr, Operands);
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());

if (!shouldWiden(Instr, Range))
return nullptr;
Expand All @@ -9162,7 +9181,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(

VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
ArrayRef<VPValue *> Operands) {
ArrayRef<VPValue *> Operands,
unsigned ScaleFactor) {
assert(Operands.size() == 2 &&
"Unexpected number of operands for partial reduction");

Expand Down Expand Up @@ -9195,7 +9215,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
Reduction);
ScaleFactor, Reduction);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ class VPRecipeBuilder {
/// Create and return a partial reduction recipe for a reduction instruction
/// along with binary operation and reduction phi operands.
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
ArrayRef<VPValue *> Operands);
ArrayRef<VPValue *> Operands,
unsigned ScaleFactor);

/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
Expand Down
9 changes: 4 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,10 @@ static cl::opt<bool> PrintVPlansInDotFormat(
#define DEBUG_TYPE "loop-vectorize"

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
VPSlotTracker SlotTracker(
(Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
V.print(OS, SlotTracker);
raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of adding this operator, you can replace the one above, operator<<(raw_ostream &OS, const VPValue &V). If this operator supports VPRecipeBase then that also removes the need for a cast in SLPVectorizer.cpp and the one you added in VPlanTest.cpp.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

const VPBasicBlock *Parent = R.getParent();
VPSlotTracker SlotTracker(Parent ? Parent->getPlan() : nullptr);
R.print(OS, "", SlotTracker);
return OS;
}
#endif
Expand Down
18 changes: 14 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -2033,6 +2033,9 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;

/// Get the factor that the VF of this recipe's output should be scaled by.
/// The value is used as a divisor: the register usage estimate divides the VF
/// coefficient by it to get the number of lanes this recipe's output has.
unsigned getVFScaleFactor() const { return VFScaleFactor; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps good to have comments on both new functions added?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
Expand Down Expand Up @@ -2063,17 +2066,21 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// scalar value.
class VPPartialReductionRecipe : public VPSingleDefRecipe {
unsigned Opcode;
/// The divisor by which the VF of this recipe's output should be divided
/// during execution.
unsigned VFScaleFactor;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to add a comment here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


public:
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
VPValue *Op1)
VPValue *Op1, unsigned VFScaleFactor)
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
ReductionInst) {}
VFScaleFactor, ReductionInst) {}
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
unsigned VFScaleFactor,
Instruction *ReductionInst = nullptr)
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
Opcode(Opcode) {
Opcode(Opcode), VFScaleFactor(VFScaleFactor) {
[[maybe_unused]] auto *AccumulatorRecipe =
getOperand(1)->getDefiningRecipe();
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
Expand All @@ -2084,7 +2091,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {

VPPartialReductionRecipe *clone() override {
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
getUnderlyingInstr());
VFScaleFactor, getUnderlyingInstr());
}

VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
Expand All @@ -2099,6 +2106,9 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
/// Get the binary op's opcode.
unsigned getOpcode() const { return Opcode; }

/// Get the factor that the VF of this recipe's output should be scaled by.
/// This is the VFScaleFactor supplied at construction; it is used as a
/// divisor of the VF, so the output has VF / factor lanes.
unsigned getVFScaleFactor() const { return VFScaleFactor; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -516,8 +516,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());

LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
<< *cast<VPInstruction>(Values[0]) << "\n");
LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0]
<< "\n");
addCombined(Values, VPI);
return VPI;
}
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanValue.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ class VPValue {
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;

raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
raw_ostream &operator<<(raw_ostream &OS, const VPRecipeBase &R);

/// This class augments VPValue with operands which provide the inverse def-use
/// edges from VPValue's users to their defs.
Expand Down
Loading