[LV] Reduce register usage for scaled reductions #133090

Conversation
@llvm/pr-subscribers-vectorizers

Author: Sam Tebbs (SamTebbs33)

Changes

This PR accounts for scaled reductions in `calculateRegisterUsage` to reflect the fact that the number of lanes in their output is smaller than the VF.

Patch is 56.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133090.diff

5 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c9f314c0ba481..da701ef9ff1a2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5026,10 +5026,23 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
// even in the scalar case.
RegUsage[ClassID] += 1;
} else {
+ // The output from scaled phis and scaled reductions actually have
+ // fewer lanes than the VF.
+ auto VF = VFs[J];
+ if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R))
+ VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor());
+ else if (auto *PartialReductionR =
+ dyn_cast<VPPartialReductionRecipe>(R))
+ VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor());
+ if (VF != VFs[J])
+ LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
+ << " to " << VF << " for ";
+ R->dump(););
+
for (VPValue *DefV : R->definedValues()) {
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
- RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
+ RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
}
}
}
@@ -8963,8 +8976,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (getScalingForReduction(Instr))
- return tryToCreatePartialReduction(Instr, Operands);
+ if (auto ScaleFactor = getScalingForReduction(Instr))
+ return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8988,7 +9001,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands) {
+ ArrayRef<VPValue *> Operands,
+ unsigned ScaleFactor) {
assert(Operands.size() == 2 &&
"Unexpected number of operands for partial reduction");
@@ -9021,7 +9035,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
- Reduction);
+ ScaleFactor, Reduction);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 334cfbad8bd7c..fd0064a34c4c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -178,7 +178,8 @@ class VPRecipeBuilder {
/// Create and return a partial reduction recipe for a reduction instruction
/// along with binary operation and reduction phi operands.
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands);
+ ArrayRef<VPValue *> Operands,
+ unsigned ScaleFactor);
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 80b3d2a760293..d84efb1bd6850 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2001,6 +2001,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
+ unsigned getVFScaleFactor() const { return VFScaleFactor; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -2031,17 +2033,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// scalar value.
class VPPartialReductionRecipe : public VPSingleDefRecipe {
unsigned Opcode;
+ unsigned ScaleFactor;
public:
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
- VPValue *Op1)
+ VPValue *Op1, unsigned ScaleFactor)
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
- ReductionInst) {}
+ ScaleFactor, ReductionInst) {}
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
+ unsigned ScaleFactor,
Instruction *ReductionInst = nullptr)
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
- Opcode(Opcode) {
+ Opcode(Opcode), ScaleFactor(ScaleFactor) {
[[maybe_unused]] auto *AccumulatorRecipe =
getOperand(1)->getDefiningRecipe();
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
@@ -2052,7 +2056,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
VPPartialReductionRecipe *clone() override {
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
- getUnderlyingInstr());
+ ScaleFactor, getUnderlyingInstr());
}
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
@@ -2067,6 +2071,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
/// Get the binary op's opcode.
unsigned getOpcode() const { return Opcode; }
+ unsigned getScaleFactor() const { return ScaleFactor; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index de710bfbf8561..ab1cf84dba67d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -780,10 +780,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
@@ -792,6 +792,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
@@ -805,45 +809,81 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP40]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP46]])
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP48]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP52]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = mul nsw <16 x i32> [[TMP50]], [[TMP53]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP21]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP54]])
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP55]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP37]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP39]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP26]])
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP41]])
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP47]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
+; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP49]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP51]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
-; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE10]], [[PARTIAL_REDUCE13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 75705fdfc23e5..9eaec9353589c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -3177,6 +3177,420 @@ for.exit: ; preds = %for.body
ret i32 %add
}
+define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 %n) #1 {
+; CHECK-INTERLEAVE1-LABEL: define dso_local void @dotp_high_register_pressure(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CMP100:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP100]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-INTERLEAVE1: for.body.lr.ph:
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 4
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B_12:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 8
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 12
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 16
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 20
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 24
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX67:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 28
+; CHECK-INTERLEAVE1-NEXT: [[SUM_PROMOTED:%.*]] = load i32, ptr [[SUM]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX13_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[GEP_B_12_PROMOTED:%.*]] = load i32, ptr [[GEP_B_12]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX31_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX31]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX40_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX40]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58_PROMOTED1:%.*]] = load i32, ptr [[ARRAYIDX5...
[truncated]
@llvm/pr-subscribers-llvm-transforms

Author: Sam Tebbs (SamTebbs33)

(Same summary and truncated patch as the @llvm/pr-subscribers-vectorizers comment above.)
Looks generally good to me so far, with a few nitpicks.
@@ -2031,17 +2033,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// scalar value.
 class VPPartialReductionRecipe : public VPSingleDefRecipe {
   unsigned Opcode;
   unsigned ScaleFactor;
Nit: Could this be `VFScaleFactor` to match the equivalent in `VPReductionPHIRecipe`?
Done.
else if (auto *PartialReductionR =
             dyn_cast<VPPartialReductionRecipe>(R))
  VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor());
if (VF != VFs[J])
Nit: If the condition is only used for debug output then can it be moved inside the `LLVM_DEBUG` macro?
Done.
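For context, a minimal sketch of the hoisted form, following the revised code quoted further down in this review (the tail of the debug message is an assumption, since the quote is cut off):

```cpp
// The VF comparison only guards debug printing, so the whole check can live
// inside LLVM_DEBUG and compile away in release (NDEBUG) builds.
LLVM_DEBUG(if (VF != VFs[J]) {
  dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
         << " for " << *R << "\n";
});
```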
Could you pre-commit this test, so we can see how the output changes before and after the changes in `LoopVectorize.cpp`?
Good idea, done.
// The output from scaled phis and scaled reductions actually have
// fewer lanes than the VF.
auto VF = VFs[J];
if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R))
[Idle thought, feel free to ignore]
I wonder if there's precedent to add a `getVFScaleFactor` or equivalent to the base recipe class (or one of the other subclasses), and allow any recipe to override it instead of explicitly checking for every type that could scale the VF.
Likely not yet, and almost certainly not in this patch, but maybe something to consider in the future?
Yeah, that's a nice idea. We could add a `VPScaledRecipe` class. I agree with doing it afterwards.
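A rough sketch of the kind of hook being floated here; the names below are hypothetical toy types, not part of this patch or of the real VPlan class hierarchy:

```cpp
// Toy model: a virtual scale-factor query on the recipe base class would let
// callers drop the per-type dyn_cast chain used in calculateRegisterUsage.
struct RecipeSketch {
  virtual ~RecipeSketch() = default;
  // Most recipes produce one output lane per VF lane.
  virtual unsigned getVFScaleFactor() const { return 1; }
};

struct PartialReductionSketch : RecipeSketch {
  unsigned VFScaleFactor;
  explicit PartialReductionSketch(unsigned S) : VFScaleFactor(S) {}
  unsigned getVFScaleFactor() const override { return VFScaleFactor; }
};
```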
✅ With the latest revision this PR passed the C/C++ code formatter.
@@ -5026,10 +5026,24 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
 // even in the scalar case.
 RegUsage[ClassID] += 1;
 } else {
   // The output from scaled phis and scaled reductions actually have
   // fewer lanes than the VF.
   auto VF = VFs[J];
nit: please don't use `auto` when it's not obvious from the context what the type is. (I'm reviewing this code and can't see from the limited context that GitHub gives me what the type of `VF` is.)
Done.
@@ -3177,6 +3177,420 @@ for.exit: ; preds = %for.body
 ret i32 %add
 }

 define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 %n) #1 {
Is it possible to add a new RUN line that prints out the debug output from LV and have a CHECK line for the register pressure?
If so, then I wonder if there's a need to add another test, and perhaps the CHECK lines for lower register pressure are sufficient.
Added.
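The added RUN line is visible in a later quote; purely as an illustration of the pattern (the CHECK string below is an assumption, not taken from the patch), the LV debug output can be matched along these lines:

```llvm
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-REGS
; CHECK-REGS: LV(REG): Found max usage:
```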
if (isa<VPPartialReductionRecipe, VPReductionPHIRecipe>(R)) {
  auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R);
  auto *PartialReductionR =
      ReductionR ? nullptr : dyn_cast<VPPartialReductionRecipe>(R);
  unsigned ScaleFactor = ReductionR ? ReductionR->getVFScaleFactor()
                                    : PartialReductionR->getVFScaleFactor();
  return ScaleFactor;
}
Am I missing something, or can this just be:
Suggested change:

- if (isa<VPPartialReductionRecipe, VPReductionPHIRecipe>(R)) {
-   auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R);
-   auto *PartialReductionR =
-       ReductionR ? nullptr : dyn_cast<VPPartialReductionRecipe>(R);
-   unsigned ScaleFactor = ReductionR ? ReductionR->getVFScaleFactor()
-                                     : PartialReductionR->getVFScaleFactor();
-   return ScaleFactor;
- }
+ if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
+   return RR->getVFScaleFactor();
+ if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+   return RR->getVFScaleFactor();

?
Done.
unsigned ScaleFactor = getVFScaleFactor(R);
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
LLVM_DEBUG(if (VF != VFs[J]) {
  dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
nit: for the tests that check the debug output, can a check for this line be added?
Unfortunately the VF that has partial reductions is pruned before the register usage is calculated. I can add one as part of #132190 once this is merged, though.
@@ -71,6 +71,11 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
   V.print(OS, SlotTracker);
   return OS;
 }
 raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
Instead of adding this operator, you can instead replace the one above, `operator<<(raw_ostream &OS, const VPValue &V)`. If this operator supports `VPRecipeBase` then that also removes the need for a cast in SLPVectorizer.cpp and the one you added in VPlanTest.cpp.
Done.
@@ -2063,17 +2066,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// scalar value.
 class VPPartialReductionRecipe : public VPSingleDefRecipe {
   unsigned Opcode;
   unsigned VFScaleFactor;
Would be good to add a comment here.
Done.
@@ -71,6 +71,11 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
   V.print(OS, SlotTracker);
   return OS;
 }
 raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
   VPSlotTracker SlotTracker(R.getParent()->getPlan());
`Parent` may be nullptr if the recipe hasn't been added to a plan yet. Would be good to account for that in the general util.
Done.
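The final form isn't quoted in this thread, so as a sketch of how the null parent might be handled (an assumption, not the committed code):

```cpp
raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) {
  // A recipe may not have been inserted into a plan yet, in which case
  // getParent() is null and the slot tracker is built without a plan.
  const VPBasicBlock *Parent = R.getParent();
  VPSlotTracker SlotTracker(Parent ? Parent->getPlan() : nullptr);
  R.print(OS, "", SlotTracker);
  return OS;
}
```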
@@ -5014,7 +5024,6 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
 if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
         VPBranchOnMaskRecipe>(R))
   continue;
unrelated change
Done.
@@ -2,6 +2,7 @@
 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1
 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED
 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW
 ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-REGS
This is just checking the debug output for a single function, right? Might be better to move it to the existing llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll if possible.
@@ -516,8 +516,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
 auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
 auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());

 LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
                   << *cast<VPInstruction>(Values[0]) << "\n");
 LLVM_DEBUG(dbgs() << "Create VPInstruction " << cast<VPValue>(*VPI) << " "
This seems unfortunate, due to VPI being both a VPValue & VPInstruction?
Exactly, it was an ambiguous call to `operator<<`. Sander's latest suggestion to replace the VPValue print function with the VPRecipeBase one makes it unnecessary, though.
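To spell out the ambiguity in miniature, a self-contained toy model (these are hypothetical types, not the real VPlan hierarchy):

```cpp
#include <iostream>

struct ValueSketch {};
struct RecipeSketch {};
// A VPInstruction-like type deriving from both bases:
struct InstSketch : RecipeSketch, ValueSketch {};

std::ostream &operator<<(std::ostream &OS, const ValueSketch &) { return OS; }
std::ostream &operator<<(std::ostream &OS, const RecipeSketch &) { return OS; }

int main() {
  InstSketch I;
  // std::cout << I;                               // error: ambiguous overload
  std::cout << static_cast<const ValueSketch &>(I); // OK: cast picks one base
}
```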
@@ -9137,8 +9155,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
   return tryToWidenMemory(Instr, Operands, Range);

 if (getScalingForReduction(Instr))
   return tryToCreatePartialReduction(Instr, Operands);
 if (auto ScaleFactor = getScalingForReduction(Instr))
Suggested change:

- if (auto ScaleFactor = getScalingForReduction(Instr))
+ if (unsigned ScaleFactor = getScalingForReduction(Instr))
Done.
LGTM, thanks @SamTebbs33
LGTM, thanks!
@@ -2033,6 +2033,9 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 /// Generate the phi/select nodes.
 void execute(VPTransformState &State) override;

 /// Get the factor that the VF of this recipe's output should be scaled by
Suggested change:

- /// Get the factor that the VF of this recipe's output should be scaled by
+ /// Get the factor that the VF of this recipe's output should be scaled by.
@@ -2099,6 +2106,9 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
 /// Get the binary op's opcode.
 unsigned getOpcode() const { return Opcode; }

 /// Get the factor that the VF of this recipe's output should be scaled by
Suggested change:

- /// Get the factor that the VF of this recipe's output should be scaled by
+ /// Get the factor that the VF of this recipe's output should be scaled by.
This PR accounts for scaled reductions in `calculateRegisterUsage` to reflect the fact that the number of lanes in their output is smaller than the VF. Depends on llvm#126437
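To make the lane arithmetic concrete, a small sketch (the concrete numbers are illustrative, not taken from the patch):

```cpp
// A partial reduction accumulating <16 x i32> into <4 x i32> has a VF scale
// factor of 4: at VF = 16 its output occupies only 16 / 4 = 4 lanes, so its
// register usage should be costed with the scaled-down VF.
ElementCount VF = ElementCount::getFixed(16);
unsigned VFScaleFactor = 4;
ElementCount ScaledVF = VF.divideCoefficientBy(VFScaleFactor); // fixed, 4 lanes
```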