From a994bf2e1f8caa6b884696b12dafd56791e7f227 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 26 Mar 2025 14:01:59 +0000 Subject: [PATCH 01/19] [LV] Reduce register usage for scaled reductions --- .../Transforms/Vectorize/LoopVectorize.cpp | 25 +++- .../Transforms/Vectorize/VPRecipeBuilder.h | 3 +- llvm/lib/Transforms/Vectorize/VPlan.h | 14 ++- .../partial-reduce-dot-product-neon.ll | 116 ++++++++++++------ .../AArch64/partial-reduce-dot-product.ll | 114 ++++++++--------- 5 files changed, 166 insertions(+), 106 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 141a4fd83c833..d07fbe241ed10 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5014,7 +5014,6 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, if (isa(R)) continue; - if (VFs[J].isScalar() || isa(R) || @@ -5028,10 +5027,23 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, // even in the scalar case. RegUsage[ClassID] += 1; } else { + // The output from scaled phis and scaled reductions actually have + // fewer lanes than the VF. 
+ auto VF = VFs[J]; + if (auto *ReductionR = dyn_cast(R)) + VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor()); + else if (auto *PartialReductionR = + dyn_cast(R)) + VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor()); + if (VF != VFs[J]) + LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J] + << " to " << VF << " for "; + R->dump();); + for (VPValue *DefV : R->definedValues()) { Type *ScalarTy = TypeInfo.inferScalarType(DefV); unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy); - RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]); + RegUsage[ClassID] += GetRegUsage(ScalarTy, VF); } } } @@ -9137,8 +9149,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe( if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); - if (getScalingForReduction(Instr)) - return tryToCreatePartialReduction(Instr, Operands); + if (auto ScaleFactor = getScalingForReduction(Instr)) + return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()); if (!shouldWiden(Instr, Range)) return nullptr; @@ -9162,7 +9174,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe( VPRecipeBase * VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef Operands) { + ArrayRef Operands, + unsigned ScaleFactor) { assert(Operands.size() == 2 && "Unexpected number of operands for partial reduction"); @@ -9195,7 +9208,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc()); } return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator, - Reduction); + ScaleFactor, Reduction); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 334cfbad8bd7c..fd0064a34c4c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ 
b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -178,7 +178,8 @@ class VPRecipeBuilder { /// Create and return a partial reduction recipe for a reduction instruction /// along with binary operation and reduction phi operands. VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef Operands); + ArrayRef Operands, + unsigned ScaleFactor); /// Set the recipe created for given ingredient. void setRecipe(Instruction *I, VPRecipeBase *R) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index da7aef73f9df3..35bb4ce85a6d8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2033,6 +2033,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Generate the phi/select nodes. void execute(VPTransformState &State) override; + unsigned getVFScaleFactor() const { return VFScaleFactor; } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -2063,17 +2065,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// scalar value. 
class VPPartialReductionRecipe : public VPSingleDefRecipe { unsigned Opcode; + unsigned ScaleFactor; public: VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, - VPValue *Op1) + VPValue *Op1, unsigned ScaleFactor) : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, - ReductionInst) {} + ScaleFactor, ReductionInst) {} VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, + unsigned ScaleFactor, Instruction *ReductionInst = nullptr) : VPSingleDefRecipe(VPDef::VPPartialReductionSC, ArrayRef({Op0, Op1}), ReductionInst), - Opcode(Opcode) { + Opcode(Opcode), ScaleFactor(ScaleFactor) { [[maybe_unused]] auto *AccumulatorRecipe = getOperand(1)->getDefiningRecipe(); assert((isa(AccumulatorRecipe) || @@ -2084,7 +2088,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { VPPartialReductionRecipe *clone() override { return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), - getUnderlyingInstr()); + ScaleFactor, getUnderlyingInstr()); } VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) @@ -2099,6 +2103,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { /// Get the binary op's opcode. unsigned getOpcode() const { return Opcode; } + unsigned getScaleFactor() const { return ScaleFactor; } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index a8476dbddb3c2..a55a5813044cb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -770,10 +770,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -782,6 +782,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: 
[[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -794,45 +798,81 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]]) -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; 
CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]]) -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> 
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], 
align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP25]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP27]] +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]]) +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP31]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, 
ptr [[TMP35]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]] +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]]) +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP41]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP45]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP46]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP47]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x 
i32> [[TMP44]], [[TMP48]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE10]], [[PARTIAL_REDUCE13]] +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]] +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]] +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]]) ; CHECK-INTERLEAVED-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index bbcc6db020307..541cf034e0f3a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -3256,65 +3256,65 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI14:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, 
ptr [[TMP9]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[WIDE_LOAD16]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nsw i64 [[INDEX]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC27:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC30:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC25]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add <4 x i32> [[TMP21]], [[VEC_PHI14]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC26]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI12]] -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC27]] to <4 x i32> -; 
CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP34]] = add <4 x i32> [[TMP33]], [[VEC_PHI10]] -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <4 x i8> [[STRIDED_VEC28]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP40]] = add <4 x i32> [[TMP39]], [[VEC_PHI8]] -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <4 x i8> [[STRIDED_VEC29]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP46]] = add <4 x i32> [[TMP45]], [[VEC_PHI6]] -; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <4 x i8> [[STRIDED_VEC30]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = mul nsw <4 x i32> [[TMP49]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP52]] = add <4 x i32> [[TMP51]], [[VEC_PHI4]] -; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <4 x i8> [[STRIDED_VEC31]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP58]] = add <4 x i32> [[TMP57]], [[VEC_PHI2]] -; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <4 x i8> [[STRIDED_VEC32]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <4 x i32> [[TMP63]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <4 x i8> 
[[STRIDED_VEC]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <4 x i32> [[TMP14]], [[VEC_PHI7]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <4 x i8> [[STRIDED_VEC8]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP18]] = add <4 x i32> [[TMP17]], [[VEC_PHI6]] +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC9]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add <4 x i32> [[TMP20]], [[VEC_PHI5]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <4 x i8> [[STRIDED_VEC10]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP22]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <4 x i32> [[TMP23]], [[VEC_PHI4]] +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC11]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <4 x i32> [[TMP26]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <4 x i8> [[STRIDED_VEC12]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP30]] = add <4 x i32> [[TMP29]], [[VEC_PHI2]] +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC13]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <4 x i32> [[TMP32]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <4 x i8> [[STRIDED_VEC14]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP36]] = add <4 x i32> [[TMP35]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; 
CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP64]]) -; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP58]]) -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP52]]) -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP46]]) -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]]) -; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP34]]) -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP28]]) -; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP22]]) +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]]) +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]]) +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]]) +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]]) +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]]) +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]]) +; 
CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: From 82151a8f22044e42302417a6fcbfade2f2a66cfd Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Mon, 31 Mar 2025 15:23:46 +0100 Subject: [PATCH 02/19] Rename to getVFScaleFactor --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlan.h | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d07fbe241ed10..ff96e98ef9cfa 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5034,7 +5034,7 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor()); else if (auto *PartialReductionR = dyn_cast(R)) - VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor()); + VF = VF.divideCoefficientBy(PartialReductionR->getVFScaleFactor()); if (VF != VFs[J]) LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF << " for "; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 35bb4ce85a6d8..6928bea4bb092 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2065,19 +2065,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// scalar value. 
class VPPartialReductionRecipe : public VPSingleDefRecipe { unsigned Opcode; - unsigned ScaleFactor; + unsigned VFScaleFactor; public: VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, - VPValue *Op1, unsigned ScaleFactor) + VPValue *Op1, unsigned VFScaleFactor) : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, - ScaleFactor, ReductionInst) {} + VFScaleFactor, ReductionInst) {} VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, - unsigned ScaleFactor, + unsigned VFScaleFactor, Instruction *ReductionInst = nullptr) : VPSingleDefRecipe(VPDef::VPPartialReductionSC, ArrayRef({Op0, Op1}), ReductionInst), - Opcode(Opcode), ScaleFactor(ScaleFactor) { + Opcode(Opcode), VFScaleFactor(VFScaleFactor) { [[maybe_unused]] auto *AccumulatorRecipe = getOperand(1)->getDefiningRecipe(); assert((isa(AccumulatorRecipe) || @@ -2088,7 +2088,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { VPPartialReductionRecipe *clone() override { return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), - ScaleFactor, getUnderlyingInstr()); + VFScaleFactor, getUnderlyingInstr()); } VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) @@ -2103,7 +2103,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { /// Get the binary op's opcode. unsigned getOpcode() const { return Opcode; } - unsigned getScaleFactor() const { return ScaleFactor; } + unsigned getVFScaleFactor() const { return VFScaleFactor; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. 
From f5d5ed6f63cef1dec268ef19d5b553d6c91036eb Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Mon, 31 Mar 2025 15:06:54 +0100 Subject: [PATCH 03/19] Put if statement inside DEBUG --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ff96e98ef9cfa..05a8e999c6c2b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5035,10 +5035,11 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, else if (auto *PartialReductionR = dyn_cast(R)) VF = VF.divideCoefficientBy(PartialReductionR->getVFScaleFactor()); - if (VF != VFs[J]) - LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J] - << " to " << VF << " for "; - R->dump();); + LLVM_DEBUG(if (VF != VFs[J]) { + dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF + << " for "; + R->dump(); + }); for (VPValue *DefV : R->definedValues()) { Type *ScalarTy = TypeInfo.inferScalarType(DefV); From 54af7d2f245d9f7323052d04526a4d3ce55b9da4 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 2 Apr 2025 16:54:27 +0100 Subject: [PATCH 04/19] auto -> ElementCount --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 05a8e999c6c2b..0c4cd232d91ef 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5029,7 +5029,7 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, } else { // The output from scaled phis and scaled reductions actually have // fewer lanes than the VF. 
- auto VF = VFs[J]; + ElementCount VF = VFs[J]; if (auto *ReductionR = dyn_cast(R)) VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor()); else if (auto *PartialReductionR = From b6b9063ba685ef270b3248637ac11a04cfbf9495 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 2 Apr 2025 16:54:43 +0100 Subject: [PATCH 05/19] Add register pressure debug output checks --- .../AArch64/partial-reduce-dot-product-neon.ll | 7 +++++++ .../LoopVectorize/AArch64/partial-reduce-dot-product.ll | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index a55a5813044cb..c02e67f66a76a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -2,6 +2,7 @@ ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-REGS target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" @@ -947,6 +948,12 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: ; +; CHECK-REGS: LV: Checking a loop in 'dotp_unrolled' from +; CHECK-REGS: LV(REG): VF 
= 16 +; CHECK-REGS-NEXT: LV(REG): Found max usage: 2 item +; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers +; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers +; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 1 item entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 541cf034e0f3a..d17ac3dac8b9e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -2,6 +2,7 @@ ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-REGS target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" @@ -3420,6 +3421,12 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: ; +; CHECK-REGS-LABEL: LV: Checking a loop in 'dotp_high_register_pressure' from +; CHECK-REGS: LV(REG): VF = 16 +; CHECK-REGS-NEXT: LV(REG): Found max usage: 2 item +; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers +; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers +; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 1 
item entry: %cmp100 = icmp sgt i32 %n, 0 br i1 %cmp100, label %for.body.lr.ph, label %for.cond.cleanup From a6951248f86d30ac14ef0582e138ef69a73c51e3 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 3 Apr 2025 17:35:00 +0100 Subject: [PATCH 06/19] Add REQUIRES: asserts --- .../LoopVectorize/AArch64/partial-reduce-dot-product.ll | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index d17ac3dac8b9e..2a9c488747244 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1,4 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4 +; REQUIRES: asserts + ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW From f05ebee46409a29beadb5543d4261db10844148f Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Mon, 7 Apr 2025 14:50:05 +0100 Subject: [PATCH 07/19] Update tests after rebasing on fhahn's changes --- .../AArch64/partial-reduce-dot-product-neon.ll | 2 +- .../AArch64/partial-reduce-dot-product.ll | 6 +++--- .../Transforms/LoopVectorize/X86/pr47437.ll | 18 +++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 
c02e67f66a76a..696d9eaf9a9cf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -953,7 +953,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-REGS-NEXT: LV(REG): Found max usage: 2 item ; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers ; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers -; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 1 item +; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 0 item entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 2a9c488747244..ccb539ad23456 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -3423,12 +3423,12 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: ; -; CHECK-REGS-LABEL: LV: Checking a loop in 'dotp_high_register_pressure' from +; CHECK-REGS-LABEL: LV: Checking a loop in 'not_dotp_high_register_pressure' from ; CHECK-REGS: LV(REG): VF = 16 ; CHECK-REGS-NEXT: LV(REG): Found max usage: 2 item ; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers -; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers -; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 1 item +; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 40 registers +; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 2 item entry: %cmp100 = icmp sgt i32 %n, 0 br i1 %cmp100, label %for.body.lr.ph, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll 
b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index c4291507e8d97..71e000a0272cc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -212,16 +212,16 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> ; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]] ; AVX1-NEXT: [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]] -; AVX1-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]] -; AVX1-NEXT: [[TMP49:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]] -; AVX1-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]] -; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0 -; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4 -; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4 -; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4 +; AVX1-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]] +; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]] +; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 4 +; AVX1-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP25]], align 4 +; AVX1-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP26]], align 4 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX1-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] From ff432c92f832fe14d400049c20871fffac6df48c Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Tue, 8 Apr 2025 10:16:11 +0100 Subject: [PATCH 08/19] Use one block for scaling factor consideration --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0c4cd232d91ef..e9f733eebe09d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5030,11 +5030,15 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, // The output from scaled phis and scaled reductions actually have // fewer lanes than the VF. ElementCount VF = VFs[J]; - if (auto *ReductionR = dyn_cast(R)) - VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor()); - else if (auto *PartialReductionR = - dyn_cast(R)) - VF = VF.divideCoefficientBy(PartialReductionR->getVFScaleFactor()); + if (isa(R)) { + auto *ReductionR = dyn_cast(R); + auto *PartialReductionR = + ReductionR ? nullptr : dyn_cast(R); + unsigned ScaleFactor = ReductionR + ? 
ReductionR->getVFScaleFactor() + : PartialReductionR->getVFScaleFactor(); + VF = VF.divideCoefficientBy(ScaleFactor); + } LLVM_DEBUG(if (VF != VFs[J]) { dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF << " for "; From 0f68427099a951179a83c4b7f283fb32e9d8c3b8 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Tue, 8 Apr 2025 09:30:46 +0100 Subject: [PATCH 09/19] Add comments for getVFScaleFactor --- llvm/lib/Transforms/Vectorize/VPlan.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6928bea4bb092..fd5a66d73e288 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2033,6 +2033,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Generate the phi/select nodes. void execute(VPTransformState &State) override; + /// Get the factor that the VF of this recipe's output should be scaled by unsigned getVFScaleFactor() const { return VFScaleFactor; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2103,6 +2104,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { /// Get the binary op's opcode. 
unsigned getOpcode() const { return Opcode; } + /// Get the factor that the VF of this recipe's output should be scaled by unsigned getVFScaleFactor() const { return VFScaleFactor; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) From 84273625c2287355971ddbda0e48095a512fe81d Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Tue, 8 Apr 2025 09:27:40 +0100 Subject: [PATCH 10/19] Add missing asserts requirement --- .../LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 696d9eaf9a9cf..70bc05922306f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -1,4 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4 +; REQUIRES: asserts + ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW From b905806ab8616ca300564a8e05bb1ae27a5a7b1c Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 9 Apr 2025 15:13:23 +0100 Subject: [PATCH 11/19] Add << overload for VPDef and getVFScaleFactor utility function --- .../Transforms/Vectorize/LoopVectorize.cpp | 31 +++++++++++-------- llvm/lib/Transforms/Vectorize/VPlan.cpp | 9 ++++++ llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 
4 +-- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../Transforms/Vectorize/VPlanTest.cpp | 2 +- 5 files changed, 31 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e9f733eebe09d..51d018e444161 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4874,6 +4874,20 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { } } +/// Get the VF scaling factor applied to the recipe's output, if the recipe has +/// one. +static unsigned getVFScaleFactor(VPRecipeBase *R) { + if (isa(R)) { + auto *ReductionR = dyn_cast(R); + auto *PartialReductionR = + ReductionR ? nullptr : dyn_cast(R); + unsigned ScaleFactor = ReductionR ? ReductionR->getVFScaleFactor() + : PartialReductionR->getVFScaleFactor(); + return ScaleFactor; + } + return 1; +} + /// Estimate the register usage for \p Plan and vectorization factors in \p VFs /// by calculating the highest number of values that are live at a single /// location as a rough estimate. Returns the register usage for each VF in \p @@ -5027,22 +5041,13 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, // even in the scalar case. RegUsage[ClassID] += 1; } else { - // The output from scaled phis and scaled reductions actually have + // The output from scaled phis and scaled reductions actually has // fewer lanes than the VF. - ElementCount VF = VFs[J]; - if (isa(R)) { - auto *ReductionR = dyn_cast(R); - auto *PartialReductionR = - ReductionR ? nullptr : dyn_cast(R); - unsigned ScaleFactor = ReductionR - ? 
ReductionR->getVFScaleFactor() - : PartialReductionR->getVFScaleFactor(); - VF = VF.divideCoefficientBy(ScaleFactor); - } + unsigned ScaleFactor = getVFScaleFactor(R); + ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); LLVM_DEBUG(if (VF != VFs[J]) { dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF - << " for "; - R->dump(); + << " for " << *R << "\n"; }); for (VPValue *DefV : R->definedValues()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 9474e7a171dff..26895b20ec708 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -71,6 +71,15 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { V.print(OS, SlotTracker); return OS; } +raw_ostream &llvm::operator<<(raw_ostream &OS, const VPDef &D) { + /// If this def has a single value, we can cast it to an instruction and use + /// its plan for the slot tracker. + if (const VPValue *Val = D.getVPSingleValue()) + return OS << *Val; + VPSlotTracker SlotTracker(nullptr); + D.print(OS, "", SlotTracker); + return OS; +} #endif Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder, diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index e943c7a29eb83..85e13a499471c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -516,8 +516,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { auto *Inst = cast(Values[0])->getUnderlyingInstr(); auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); - LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " - << *cast(Values[0]) << "\n"); + LLVM_DEBUG(dbgs() << "Create VPInstruction " << cast(*VPI) << " " + << Values[0] << "\n"); addCombined(Values, VPI); return VPI; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index d322fdfa727e4..6f0de80578913 
100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -195,6 +195,7 @@ typedef DenseMap Value2VPValueTy; typedef DenseMap VPValue2ValueTy; raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); +raw_ostream &operator<<(raw_ostream &OS, const VPDef &D); /// This class augments VPValue with operands which provide the inverse def-use /// edges from VPValue's users to their defs. diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index cb7545171744e..a2c66f939adb0 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -812,7 +812,7 @@ Successor(s): ir-bb { std::string I4Dump; raw_string_ostream OS(I4Dump); - OS << *I4; + OS << *cast(I4); EXPECT_EQ("EMIT vp<%5> = mul vp<%3>, vp<%2>", I4Dump); } } From 5f14165168e2056ef40c99cd32a89f5a27aceda4 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 10 Apr 2025 10:15:33 +0100 Subject: [PATCH 12/19] Simplify utility function --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 51d018e444161..b595908abb325 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4877,14 +4877,10 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { /// Get the VF scaling factor applied to the recipe's output, if the recipe has /// one. static unsigned getVFScaleFactor(VPRecipeBase *R) { - if (isa(R)) { - auto *ReductionR = dyn_cast(R); - auto *PartialReductionR = - ReductionR ? nullptr : dyn_cast(R); - unsigned ScaleFactor = ReductionR ? 
ReductionR->getVFScaleFactor() - : PartialReductionR->getVFScaleFactor(); - return ScaleFactor; - } + if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R)) + return RR->getVFScaleFactor(); + if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R)) + return RR->getVFScaleFactor(); return 1; } From 296e3ce09da48bd64b79746e76db0dfd11fbb44c Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 10 Apr 2025 10:58:28 +0100 Subject: [PATCH 13/19] Fix test by printing VPRecipeBase --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 10 +++------- llvm/lib/Transforms/Vectorize/VPlanValue.h | 2 +- llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 26895b20ec708..34aa3ff1a744a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -71,13 +71,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { V.print(OS, SlotTracker); return OS; } -raw_ostream &llvm::operator<<(raw_ostream &OS, const VPDef &D) { - /// If this def has a single value, we can cast it to an instruction and use - /// its plan for the slot tracker. 
- if (const VPValue *Val = D.getVPSingleValue()) - return OS << *Val; - VPSlotTracker SlotTracker(nullptr); - D.print(OS, "", SlotTracker); +raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) { + VPSlotTracker SlotTracker(R.getParent()->getPlan()); + R.print(OS, "", SlotTracker); return OS; } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 6f0de80578913..33a049cc74aeb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -195,7 +195,7 @@ typedef DenseMap Value2VPValueTy; typedef DenseMap VPValue2ValueTy; raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); -raw_ostream &operator<<(raw_ostream &OS, const VPDef &D); +raw_ostream &operator<<(raw_ostream &OS, const VPRecipeBase &R); /// This class augments VPValue with operands which provide the inverse def-use /// edges from VPValue's users to their defs. diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index a2c66f939adb0..3c81376eb83e0 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -812,7 +812,7 @@ Successor(s): ir-bb { std::string I4Dump; raw_string_ostream OS(I4Dump); - OS << *cast(I4); + OS << *cast(I4); EXPECT_EQ("EMIT vp<%5> = mul vp<%3>, vp<%2>", I4Dump); } } From c38fcabf65fb8c5d96fd9adc2000bdea99cb2662 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 10 Apr 2025 15:19:41 +0100 Subject: [PATCH 14/19] Re-add new line --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b595908abb325..e07caeb0a2e1f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5024,6 +5024,7 @@ calculateRegisterUsage(VPlan &Plan, 
ArrayRef VFs, if (isa(R)) continue; + if (VFs[J].isScalar() || isa(R) || From e6061f2c0b416f294f070f2d90d577d9ce1fb4fe Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 10 Apr 2025 15:19:56 +0100 Subject: [PATCH 15/19] auto -> std::optional --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e07caeb0a2e1f..8648401f4b23b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9156,7 +9156,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe( if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); - if (auto ScaleFactor = getScalingForReduction(Instr)) + if (std::optional ScaleFactor = getScalingForReduction(Instr)) return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()); if (!shouldWiden(Instr, Range)) From f1e7e9be3e4b2d13dec7de512a4f669cd4eba534 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 10 Apr 2025 15:20:15 +0100 Subject: [PATCH 16/19] Replace VPValue printer with VPRecipeBase --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 10 ++-------- llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 4 ++-- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 - llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 2 +- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 34aa3ff1a744a..eb4394171d3be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -64,15 +64,9 @@ static cl::opt PrintVPlansInDotFormat( #define DEBUG_TYPE "loop-vectorize" #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { - const VPInstruction *Instr = dyn_cast(&V); - VPSlotTracker SlotTracker( - (Instr && 
Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); - V.print(OS, SlotTracker); - return OS; -} raw_ostream &llvm::operator<<(raw_ostream &OS, const VPRecipeBase &R) { - VPSlotTracker SlotTracker(R.getParent()->getPlan()); + const VPBasicBlock *Parent = R.getParent(); + VPSlotTracker SlotTracker(Parent ? Parent->getPlan() : nullptr); R.print(OS, "", SlotTracker); return OS; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 85e13a499471c..c4d79526721ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -516,8 +516,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { auto *Inst = cast(Values[0])->getUnderlyingInstr(); auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); - LLVM_DEBUG(dbgs() << "Create VPInstruction " << cast(*VPI) << " " - << Values[0] << "\n"); + LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0] + << "\n"); addCombined(Values, VPI); return VPI; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 33a049cc74aeb..70ce58900555c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -194,7 +194,6 @@ class VPValue { typedef DenseMap Value2VPValueTy; typedef DenseMap VPValue2ValueTy; -raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); raw_ostream &operator<<(raw_ostream &OS, const VPRecipeBase &R); /// This class augments VPValue with operands which provide the inverse def-use diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 3c81376eb83e0..cb7545171744e 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -812,7 +812,7 @@ Successor(s): ir-bb { std::string I4Dump; raw_string_ostream OS(I4Dump); - OS << *cast(I4); + OS << 
*I4; EXPECT_EQ("EMIT vp<%5> = mul vp<%3>, vp<%2>", I4Dump); } } From bde39b46fa02c362b2e01902952e355c5fd6baf3 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 10 Apr 2025 15:20:48 +0100 Subject: [PATCH 17/19] Add VFScaleFactor comment --- llvm/lib/Transforms/Vectorize/VPlan.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fd5a66d73e288..89a0c75b39c3d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2066,6 +2066,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// scalar value. class VPPartialReductionRecipe : public VPSingleDefRecipe { unsigned Opcode; + /// The divisor by which the VF of this recipe's output should be divided + /// during execution. unsigned VFScaleFactor; public: From f58b5e1a175b28208839e4e2fa0d3b7c3f58b4e9 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 10 Apr 2025 15:22:04 +0100 Subject: [PATCH 18/19] Move reg usage tests to reg-usage.ll --- .../partial-reduce-dot-product-neon.ll | 10 - .../AArch64/partial-reduce-dot-product.ll | 10 - .../LoopVectorize/AArch64/reg-usage.ll | 180 ++++++++++++++++++ 3 files changed, 180 insertions(+), 20 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 70bc05922306f..e6687fe767c0a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -1,10 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4 -; REQUIRES: asserts - ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 
; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -force-vector-interleave=1 -vectorizer-maximize-bandwidth -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-REGS target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" @@ -949,13 +946,6 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: -; -; CHECK-REGS: LV: Checking a loop in 'dotp_unrolled' from -; CHECK-REGS: LV(REG): VF = 16 -; CHECK-REGS-NEXT: LV(REG): Found max usage: 2 item -; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers -; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers -; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 0 item entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index ccb539ad23456..1b22523e9f5bd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1,10 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4 -; REQUIRES: asserts - ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 ; RUN: opt -passes=loop-vectorize 
-enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-REGS target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" @@ -3422,13 +3419,6 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: -; -; CHECK-REGS-LABEL: LV: Checking a loop in 'not_dotp_high_register_pressure' from -; CHECK-REGS: LV(REG): VF = 16 -; CHECK-REGS-NEXT: LV(REG): Found max usage: 2 item -; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers -; CHECK-REGS-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 40 registers -; CHECK-REGS-NEXT: LV(REG): Found invariant usage: 2 item entry: %cmp100 = icmp sgt i32 %n, 0 br i1 %cmp100, label %for.body.lr.ph, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index ab12ca4254a09..2e6efec297d61 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -68,3 +68,183 @@ loop: exit: ret void } + +define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 %n) #1 { +; CHECK-LABEL: LV: Checking a loop in 'dotp_high_register_pressure' from +; CHECK: LV(REG): VF = 16 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers +; CHECK-NEXT: LV(REG): 
RegisterClass: Generic::VectorRC, 40 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 2 item +entry: + %cmp100 = icmp sgt i32 %n, 0 + br i1 %cmp100, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %arrayidx13 = getelementptr inbounds nuw i8, ptr %sum, i64 4 + %gep.b.12 = getelementptr inbounds nuw i8, ptr %sum, i64 8 + %arrayidx31 = getelementptr inbounds nuw i8, ptr %sum, i64 12 + %arrayidx40 = getelementptr inbounds nuw i8, ptr %sum, i64 16 + %arrayidx49 = getelementptr inbounds nuw i8, ptr %sum, i64 20 + %arrayidx58 = getelementptr inbounds nuw i8, ptr %sum, i64 24 + %arrayidx67 = getelementptr inbounds nuw i8, ptr %sum, i64 28 + %sum.promoted = load i32, ptr %sum, align 4 + %arrayidx13.promoted = load i32, ptr %arrayidx13, align 4 + %gep.b.12.promoted = load i32, ptr %gep.b.12, align 4 + %arrayidx31.promoted = load i32, ptr %arrayidx31, align 4 + %arrayidx40.promoted = load i32, ptr %arrayidx40, align 4 + %arrayidx49.promoted = load i32, ptr %arrayidx49, align 4 + %arrayidx58.promoted = load i32, ptr %arrayidx58, align 4 + %arrayidx67.promoted = load i32, ptr %arrayidx67, align 4 + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body + %add.lcssa = phi i32 [ %add.1, %for.body ] + %add.2.lcssa = phi i32 [ %add.2, %for.body ] + %add.3.lcssa = phi i32 [ %add.3, %for.body ] + %add.4.lcssa = phi i32 [ %add.4, %for.body ] + %add.5.lcssa = phi i32 [ %add.5, %for.body ] + %add.6.lcssa = phi i32 [ %add.6, %for.body ] + %add.7.lcssa = phi i32 [ %add.7, %for.body ] + %add.8.lcssa = phi i32 [ %add.8, %for.body ] + store i32 %add.lcssa, ptr %sum, align 4 + store i32 %add.2.lcssa, ptr %arrayidx13, align 4 + store i32 %add.3.lcssa, ptr %gep.b.12, align 4 + store i32 %add.4.lcssa, ptr %arrayidx31, align 4 + store i32 %add.5.lcssa, ptr %arrayidx40, align 4 + store i32 %add.6.lcssa, ptr %arrayidx49, align 4 + store i32 %add.7.lcssa, ptr %arrayidx58, align 4 + store 
i32 %add.8.lcssa, ptr %arrayidx67, align 4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %0 = phi i32 [ %arrayidx67.promoted, %for.body.lr.ph ], [ %add.8, %for.body ] + %1 = phi i32 [ %arrayidx58.promoted, %for.body.lr.ph ], [ %add.7, %for.body ] + %2 = phi i32 [ %arrayidx49.promoted, %for.body.lr.ph ], [ %add.6, %for.body ] + %3 = phi i32 [ %arrayidx40.promoted, %for.body.lr.ph ], [ %add.5, %for.body ] + %4 = phi i32 [ %arrayidx31.promoted, %for.body.lr.ph ], [ %add.4, %for.body ] + %5 = phi i32 [ %gep.b.12.promoted, %for.body.lr.ph ], [ %add.3, %for.body ] + %6 = phi i32 [ %arrayidx13.promoted, %for.body.lr.ph ], [ %add.2, %for.body ] + %7 = phi i32 [ %sum.promoted, %for.body.lr.ph ], [ %add.1, %for.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %load.a = load i8, ptr %arrayidx, align 1 + %ext.a = zext i8 %load.a to i32 + %9 = shl nsw i64 %indvars.iv, 3 + %gep.b.1 = getelementptr inbounds nuw i8, ptr %b, i64 %9 + %load.b.1 = load i8, ptr %gep.b.1, align 1 + %ext.b.1 = sext i8 %load.b.1 to i32 + %mul.1 = mul nsw i32 %ext.b.1, %ext.a + %add.1 = add nsw i32 %mul.1, %7 + %11 = or disjoint i64 %9, 1 + %gep.b.2 = getelementptr inbounds nuw i8, ptr %b, i64 %11 + %load.b.2 = load i8, ptr %gep.b.2, align 1 + %ext.b.2 = sext i8 %load.b.2 to i32 + %mul.2 = mul nsw i32 %ext.b.2, %ext.a + %add.2 = add nsw i32 %mul.2, %6 + %13 = or disjoint i64 %9, 2 + %gep.b.3 = getelementptr inbounds nuw i8, ptr %b, i64 %13 + %load.b.3 = load i8, ptr %gep.b.3, align 1 + %ext.b.3 = sext i8 %load.b.3 to i32 + %mul.3 = mul nsw i32 %ext.b.3, %ext.a + %add.3 = add nsw i32 %mul.3, %5 + %15 = or disjoint i64 %9, 3 + %gep.b.4 = getelementptr inbounds nuw i8, ptr %b, i64 %15 + %load.b.4 = load i8, ptr %gep.b.4, align 1 + %ext.b.4 = sext i8 %load.b.4 to i32 + %mul.4 = mul 
nsw i32 %ext.b.4, %ext.a + %add.4 = add nsw i32 %mul.4, %4 + %17 = or disjoint i64 %9, 4 + %gep.b.5 = getelementptr inbounds nuw i8, ptr %b, i64 %17 + %load.b.5 = load i8, ptr %gep.b.5, align 1 + %ext.b.5 = sext i8 %load.b.5 to i32 + %mul.5 = mul nsw i32 %ext.b.5, %ext.a + %add.5 = add nsw i32 %mul.5, %3 + %19 = or disjoint i64 %9, 5 + %gep.b.6 = getelementptr inbounds nuw i8, ptr %b, i64 %19 + %load.b.6 = load i8, ptr %gep.b.6, align 1 + %ext.b.6 = sext i8 %load.b.6 to i32 + %mul.6 = mul nsw i32 %ext.b.6, %ext.a + %add.6 = add nsw i32 %mul.6, %2 + %21 = or disjoint i64 %9, 6 + %gep.b.7 = getelementptr inbounds nuw i8, ptr %b, i64 %21 + %load.b.7 = load i8, ptr %gep.b.7, align 1 + %ext.b.7 = sext i8 %load.b.7 to i32 + %mul.7 = mul nsw i32 %ext.b.7, %ext.a + %add.7 = add nsw i32 %mul.7, %1 + %23 = or disjoint i64 %9, 7 + %gep.b.8 = getelementptr inbounds nuw i8, ptr %b, i64 %23 + %load.b.8 = load i8, ptr %gep.b.8, align 1 + %ext.b.8 = sext i8 %load.b.8 to i32 + %mul.8 = mul nsw i32 %ext.b.8, %ext.a + %add.8 = add nsw i32 %mul.8, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body +} + +define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { +; CHECK-LABEL: LV: Checking a loop in 'dotp_unrolled' from +; CHECK: LV(REG): VF = 16 +; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum3 = phi i32 [ 0, %entry ], [ %add.a3, %for.body ] + %accum2 = phi i32 [ 0, %entry ], [ %add.a2, %for.body ] + %accum1 = phi i32 [ 0, %entry ], [ %add.a1, %for.body ] + %accum0 = phi i32 [ 0, %entry ], [ %add.a0, 
%for.body ] + %gep.a0 = getelementptr inbounds i8, ptr %a, i64 %iv + %gep.b0 = getelementptr inbounds i8, ptr %b, i64 %iv + %offset.1 = or disjoint i64 %iv, 1 + %gep.a1 = getelementptr inbounds i8, ptr %a, i64 %offset.1 + %gep.b1 = getelementptr inbounds i8, ptr %b, i64 %offset.1 + %offset.2 = or disjoint i64 %iv, 2 + %gep.a2 = getelementptr inbounds i8, ptr %a, i64 %offset.2 + %gep.b2 = getelementptr inbounds i8, ptr %b, i64 %offset.2 + %offset.3 = or disjoint i64 %iv, 3 + %gep.a3 = getelementptr inbounds i8, ptr %a, i64 %offset.3 + %gep.b3 = getelementptr inbounds i8, ptr %b, i64 %offset.3 + %load.a0 = load i8, ptr %gep.a0, align 1 + %ext.a0 = sext i8 %load.a0 to i32 + %load.b0 = load i8, ptr %gep.b0, align 1 + %ext.b0 = sext i8 %load.b0 to i32 + %mul.a0 = mul nsw i32 %ext.b0, %ext.a0 + %add.a0 = add nsw i32 %mul.a0, %accum0 + %load.a1 = load i8, ptr %gep.a1, align 1 + %ext.a1 = sext i8 %load.a1 to i32 + %load.b1 = load i8, ptr %gep.b1, align 1 + %ext.b1 = sext i8 %load.b1 to i32 + %mul.a1 = mul nsw i32 %ext.a1, %ext.b1 + %add.a1 = add nsw i32 %mul.a1, %accum1 + %load.a2 = load i8, ptr %gep.a2, align 1 + %ext.a2 = sext i8 %load.a2 to i32 + %load.b2 = load i8, ptr %gep.b2, align 1 + %ext.b2 = sext i8 %load.b2 to i32 + %mul.a2 = mul nsw i32 %ext.a2, %ext.b2 + %add.a2 = add nsw i32 %mul.a2, %accum2 + %load.a3 = load i8, ptr %gep.a3, align 1 + %ext.a3 = sext i8 %load.a3 to i32 + %load.b3 = load i8, ptr %gep.b3, align 1 + %ext.b3 = sext i8 %load.b3 to i32 + %mul.a3 = mul nsw i32 %ext.a3, %ext.b3 + %add.a3 = add nsw i32 %mul.a3, %accum3 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %num_in + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body + %result0 = add nsw i32 %add.a0, %add.a1 + %result1 = add nsw i32 %add.a2, %add.a3 + %result = add nsw i32 %result0, %result1 + ret i32 %result +} From aefea413d5dadeb3ea95034009d14ee484cb908b Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Fri, 11 Apr 2025 09:45:18 +0100 
Subject: [PATCH 19/19] Add full stops --- llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 89a0c75b39c3d..efb0789bc7041 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2033,7 +2033,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Generate the phi/select nodes. void execute(VPTransformState &State) override; - /// Get the factor that the VF of this recipe's output should be scaled by + /// Get the factor that the VF of this recipe's output should be scaled by. unsigned getVFScaleFactor() const { return VFScaleFactor; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2106,7 +2106,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { /// Get the binary op's opcode. unsigned getOpcode() const { return Opcode; } - /// Get the factor that the VF of this recipe's output should be scaled by + /// Get the factor that the VF of this recipe's output should be scaled by. unsigned getVFScaleFactor() const { return VFScaleFactor; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)