Skip to content

Commit b658a2e

Browse files
authored
[LV] Reduce register usage for scaled reductions (#133090)
This PR accounts for scaled reductions in `calculateRegisterUsage` to reflect the fact that the number of lanes in their output is smaller than the VF. Depends on #126437.
1 parent 5b384c3 commit b658a2e

File tree

10 files changed

+372
-124
lines changed

10 files changed

+372
-124
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4878,6 +4878,16 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48784878
}
48794879
}
48804880

4881+
/// Get the VF scaling factor applied to the recipe's output, if the recipe has
4882+
/// one. Returns 1 for recipes that have no scaling factor.
4883+
static unsigned getVFScaleFactor(VPRecipeBase *R) {
4884+
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
4885+
return RR->getVFScaleFactor();
4886+
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
4887+
return RR->getVFScaleFactor();
4888+
return 1;
4889+
}
4890+
48814891
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
48824892
/// by calculating the highest number of values that are live at a single
48834893
/// location as a rough estimate. Returns the register usage for each VF in \p
@@ -5032,10 +5042,19 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
50325042
// even in the scalar case.
50335043
RegUsage[ClassID] += 1;
50345044
} else {
5045+
// The output from scaled phis and scaled reductions actually has
5046+
// fewer lanes than the VF.
5047+
unsigned ScaleFactor = getVFScaleFactor(R);
5048+
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
5049+
LLVM_DEBUG(if (VF != VFs[J]) {
5050+
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
5051+
<< " for " << *R << "\n";
5052+
});
5053+
50355054
for (VPValue *DefV : R->definedValues()) {
50365055
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
50375056
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
5038-
RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
5057+
RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
50395058
}
50405059
}
50415060
}
@@ -9141,8 +9160,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
91419160
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
91429161
return tryToWidenMemory(Instr, Operands, Range);
91439162

9144-
if (getScalingForReduction(Instr))
9145-
return tryToCreatePartialReduction(Instr, Operands);
9163+
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
9164+
return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
91469165

91479166
if (!shouldWiden(Instr, Range))
91489167
return nullptr;
@@ -9166,7 +9185,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
91669185

91679186
VPRecipeBase *
91689187
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
9169-
ArrayRef<VPValue *> Operands) {
9188+
ArrayRef<VPValue *> Operands,
9189+
unsigned ScaleFactor) {
91709190
assert(Operands.size() == 2 &&
91719191
"Unexpected number of operands for partial reduction");
91729192

@@ -9199,7 +9219,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
91999219
BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
92009220
}
92019221
return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
9202-
Reduction);
9222+
ScaleFactor, Reduction);
92039223
}
92049224

92059225
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,8 @@ class VPRecipeBuilder {
178178
/// Create and return a partial reduction recipe for a reduction instruction
179179
/// along with binary operation and reduction phi operands.
180180
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
181-
ArrayRef<VPValue *> Operands);
181+
ArrayRef<VPValue *> Operands,
182+
unsigned ScaleFactor);
182183

183184
/// Set the recipe created for given ingredient.
184185
void setRecipe(Instruction *I, VPRecipeBase *R) {

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,10 @@ static cl::opt<bool> PrintVPlansInDotFormat(
6464
#define DEBUG_TYPE "loop-vectorize"
6565

6666
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
67-
raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
68-
const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
69-
VPSlotTracker SlotTracker(
70-
(Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
71-
V.print(OS, SlotTracker);
67+
raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
68+
const VPBasicBlock *Parent = R.getParent();
69+
VPSlotTracker SlotTracker(Parent ? Parent->getPlan() : nullptr);
70+
R.print(OS, "", SlotTracker);
7271
return OS;
7372
}
7473
#endif

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2047,6 +2047,9 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
20472047
/// Generate the phi/select nodes.
20482048
void execute(VPTransformState &State) override;
20492049

2050+
/// Get the factor that the VF of this recipe's output should be divided by.
2051+
unsigned getVFScaleFactor() const { return VFScaleFactor; }
2052+
20502053
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
20512054
/// Print the recipe.
20522055
void print(raw_ostream &O, const Twine &Indent,
@@ -2077,17 +2080,21 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
20772080
/// scalar value.
20782081
class VPPartialReductionRecipe : public VPSingleDefRecipe {
20792082
unsigned Opcode;
2083+
/// The divisor by which the VF of this recipe's output should be divided
2084+
/// during execution.
2085+
unsigned VFScaleFactor;
20802086

20812087
public:
20822088
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
2083-
VPValue *Op1)
2089+
VPValue *Op1, unsigned VFScaleFactor)
20842090
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
2085-
ReductionInst) {}
2091+
VFScaleFactor, ReductionInst) {}
20862092
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
2093+
unsigned VFScaleFactor,
20872094
Instruction *ReductionInst = nullptr)
20882095
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
20892096
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
2090-
Opcode(Opcode) {
2097+
Opcode(Opcode), VFScaleFactor(VFScaleFactor) {
20912098
[[maybe_unused]] auto *AccumulatorRecipe =
20922099
getOperand(1)->getDefiningRecipe();
20932100
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
@@ -2098,7 +2105,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
20982105

20992106
VPPartialReductionRecipe *clone() override {
21002107
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
2101-
getUnderlyingInstr());
2108+
VFScaleFactor, getUnderlyingInstr());
21022109
}
21032110

21042111
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
@@ -2113,6 +2120,9 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
21132120
/// Get the binary op's opcode.
21142121
unsigned getOpcode() const { return Opcode; }
21152122

2123+
/// Get the factor that the VF of this recipe's output should be divided by.
2124+
unsigned getVFScaleFactor() const { return VFScaleFactor; }
2125+
21162126
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
21172127
/// Print the recipe.
21182128
void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanSLP.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -516,8 +516,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
516516
auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
517517
auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());
518518

519-
LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
520-
<< *cast<VPInstruction>(Values[0]) << "\n");
519+
LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0]
520+
<< "\n");
521521
addCombined(Values, VPI);
522522
return VPI;
523523
}

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ class VPValue {
194194
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
195195
typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
196196

197-
raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
197+
raw_ostream &operator<<(raw_ostream &OS, const VPRecipeBase &R);
198198

199199
/// This class augments VPValue with operands which provide the inverse def-use
200200
/// edges from VPValue's users to their defs.

0 commit comments

Comments
 (0)