Skip to content

Commit 5f81501

Browse files
committed
Separate out scaled reduction changes
1 parent 410afea commit 5f81501

File tree

5 files changed

+20
-495
lines changed

5 files changed

+20
-495
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5026,23 +5026,10 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
50265026
// even in the scalar case.
50275027
RegUsage[ClassID] += 1;
50285028
} else {
5029-
// The output from scaled phis and scaled reductions actually have
5030-
// fewer lanes than the VF.
5031-
auto VF = VFs[J];
5032-
if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R))
5033-
VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor());
5034-
else if (auto *PartialReductionR =
5035-
dyn_cast<VPPartialReductionRecipe>(R))
5036-
VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor());
5037-
if (VF != VFs[J])
5038-
LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
5039-
<< " to " << VF << " for ";
5040-
R->dump(););
5041-
50425029
for (VPValue *DefV : R->definedValues()) {
50435030
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
50445031
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
5045-
RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
5032+
RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
50465033
}
50475034
}
50485035
}
@@ -8976,8 +8963,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
89768963
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
89778964
return tryToWidenMemory(Instr, Operands, Range);
89788965

8979-
if (auto ScaleFactor = getScalingForReduction(Instr))
8980-
return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
8966+
if (getScalingForReduction(Instr))
8967+
return tryToCreatePartialReduction(Instr, Operands);
89818968

89828969
if (!shouldWiden(Instr, Range))
89838970
return nullptr;
@@ -9001,8 +8988,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
90018988

90028989
VPRecipeBase *
90038990
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
9004-
ArrayRef<VPValue *> Operands,
9005-
unsigned ScaleFactor) {
8991+
ArrayRef<VPValue *> Operands) {
90068992
assert(Operands.size() == 2 &&
90078993
"Unexpected number of operands for partial reduction");
90088994

@@ -9035,7 +9021,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
90359021
BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
90369022
}
90379023
return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
9038-
ScaleFactor, Reduction);
9024+
Reduction);
90399025
}
90409026

90419027
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,7 @@ class VPRecipeBuilder {
178178
/// Create and return a partial reduction recipe for a reduction instruction
179179
/// along with binary operation and reduction phi operands.
180180
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
181-
ArrayRef<VPValue *> Operands,
182-
unsigned ScaleFactor);
181+
ArrayRef<VPValue *> Operands);
183182

184183
/// Set the recipe created for given ingredient.
185184
void setRecipe(Instruction *I, VPRecipeBase *R) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2001,8 +2001,6 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
20012001
/// Generate the phi/select nodes.
20022002
void execute(VPTransformState &State) override;
20032003

2004-
unsigned getVFScaleFactor() const { return VFScaleFactor; }
2005-
20062004
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
20072005
/// Print the recipe.
20082006
void print(raw_ostream &O, const Twine &Indent,
@@ -2033,19 +2031,17 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
20332031
/// scalar value.
20342032
class VPPartialReductionRecipe : public VPSingleDefRecipe {
20352033
unsigned Opcode;
2036-
unsigned ScaleFactor;
20372034

20382035
public:
20392036
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
2040-
VPValue *Op1, unsigned ScaleFactor)
2037+
VPValue *Op1)
20412038
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
2042-
ScaleFactor, ReductionInst) {}
2039+
ReductionInst) {}
20432040
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
2044-
unsigned ScaleFactor,
20452041
Instruction *ReductionInst = nullptr)
20462042
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
20472043
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
2048-
Opcode(Opcode), ScaleFactor(ScaleFactor) {
2044+
Opcode(Opcode) {
20492045
[[maybe_unused]] auto *AccumulatorRecipe =
20502046
getOperand(1)->getDefiningRecipe();
20512047
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
@@ -2056,7 +2052,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
20562052

20572053
VPPartialReductionRecipe *clone() override {
20582054
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
2059-
ScaleFactor, getUnderlyingInstr());
2055+
getUnderlyingInstr());
20602056
}
20612057

20622058
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
@@ -2071,8 +2067,6 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
20712067
/// Get the binary op's opcode.
20722068
unsigned getOpcode() const { return Opcode; }
20732069

2074-
unsigned getScaleFactor() const { return ScaleFactor; }
2075-
20762070
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
20772071
/// Print the recipe.
20782072
void print(raw_ostream &O, const Twine &Indent,

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll

Lines changed: 10 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -780,10 +780,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
780780
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
781781
; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
782782
; CHECK-INTERLEAVED-NEXT: entry:
783-
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32
783+
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
784784
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
785785
; CHECK-INTERLEAVED: vector.ph:
786-
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32
786+
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
787787
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
788788
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
789789
; CHECK-INTERLEAVED: vector.body:
@@ -792,10 +792,6 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
792792
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
793793
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
794794
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
795-
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
796-
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
797-
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY]] ]
798-
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
799795
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
800796
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
801797
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
@@ -809,81 +805,45 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
809805
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
810806
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
811807
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
812-
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
813808
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
814-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
815809
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
816-
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
817810
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
818-
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
819811
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
820-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
821812
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
822-
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
823813
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
824-
; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP40]]
825-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]])
826-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP46]])
814+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
827815
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
828-
; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
829816
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
830-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP48]], align 1
831817
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
832-
; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
833818
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
834-
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
835819
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
836-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP52]], align 1
837820
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
838-
; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
839821
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
840-
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = mul nsw <16 x i32> [[TMP50]], [[TMP53]]
841-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP21]])
842-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP54]])
822+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
843823
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
844-
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
845824
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
846-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP55]], align 1
847825
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
848-
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
849826
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
850-
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
851827
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
852-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP37]], align 1
853828
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
854-
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
855829
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
856-
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = mul nsw <16 x i32> [[TMP56]], [[TMP39]]
857-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP26]])
858-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP41]])
830+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
859831
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
860-
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
861832
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
862-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1
863833
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
864-
; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
865834
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
866-
; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
867835
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
868-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP47]], align 1
869836
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
870-
; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
871837
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
872-
; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = mul nsw <16 x i32> [[TMP45]], [[TMP49]]
873838
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
874-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP51]])
875-
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
839+
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
876840
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
877841
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
878842
; CHECK-INTERLEAVED: middle.block:
879-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE10]], [[PARTIAL_REDUCE13]]
880-
; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
881-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]]
882-
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
883-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
884-
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
885-
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
886-
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
843+
; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
844+
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
845+
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
846+
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
887847
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
888848
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
889849
; CHECK-INTERLEAVED: scalar.ph:

0 commit comments

Comments
 (0)