diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d6af8a1435d07..5a3d358a405f9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -996,12 +996,15 @@ class LoopVectorizationCostModel {
     /// Holds the maximum number of concurrent live intervals in the loop.
     /// The key is ClassID of target-provided register class.
     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
-  };
-
-  /// \return Returns information about the register usages of the loop for the
-  /// given vectorization factors.
-  SmallVector<RegisterUsage, 8>
-  calculateRegisterUsage(ArrayRef<ElementCount> VFs);
+
+    /// Check if any of the tracked live intervals exceeds the number of
+    /// available registers for the target.
+    bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const {
+      return any_of(MaxLocalUsers, [&TTI](auto &LU) {
+        return LU.second > TTI.getNumberOfRegisters(LU.first);
+      });
+    }
+  };
 
   /// Collect values we want to ignore in the cost model.
   void collectValuesToIgnore();
@@ -4013,29 +4016,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
-    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
-
-    // Collect all viable vectorization factors larger than the default MaxVF
-    // (i.e. MaxVectorElementCount).
-    SmallVector<ElementCount, 8> VFs;
-    for (ElementCount VS = MaxVectorElementCount * 2;
-         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
-      VFs.push_back(VS);
-
-    // For each VF calculate its register usage.
-    auto RUs = calculateRegisterUsage(VFs);
-
-    // Select the largest VF which doesn't require more registers than existing
-    // ones.
-    for (int I = RUs.size() - 1; I >= 0; --I) {
-      const auto &MLU = RUs[I].MaxLocalUsers;
-      if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
-            return LU.second <= TTI.getNumberOfRegisters(LU.first);
-          })) {
-        MaxVF = VFs[I];
-        break;
-      }
-    }
+    MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
+
     if (ElementCount MinVF =
             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
@@ -4360,6 +4342,15 @@ static bool hasReplicatorRegion(VPlan &Plan) {
 }
 
 #ifndef NDEBUG
+/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
+/// by calculating the highest number of values that are live at a single
+/// location as a rough estimate. Returns the register usage for each VF in \p
+/// VFs.
+static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
+                       const TargetTransformInfo &TTI,
+                       const SmallPtrSetImpl<const Value *> &ValuesToIgnore);
+
 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
   InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
@@ -4383,11 +4374,19 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
   }
 
   for (auto &P : VPlans) {
-    for (ElementCount VF : P->vectorFactors()) {
+    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
+                               P->vectorFactors().end());
+    auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore);
+    for (auto [VF, RU] : zip_equal(VFs, RUs)) {
       // The cost for scalar VF=1 is already calculated, so ignore it.
       if (VF.isScalar())
         continue;
 
+      /// Don't consider the VF if it exceeds the number of registers for the
+      /// target.
+ if (RU.exceedsMaxNumRegs(TTI)) + continue; + InstructionCost C = CM.expectedCost(VF); // Add on other costs that are modelled in VPlan, but not in the legacy @@ -4859,9 +4858,13 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, isa(R) || (isa(R) && - all_of(cast(R)->users(), [&](VPUser *U) { - return cast(U)->usesScalars(R->getVPSingleValue()); - }))) { + all_of(cast(R)->users(), + [&](VPUser *U) { + return cast(U)->usesScalars( + R->getVPSingleValue()); + })) || + (isa(R) && + (cast(R))->isInLoop())) { unsigned ClassID = TTI.getRegisterClassForType( false, TypeInfo.inferScalarType(R->getVPSingleValue())); // FIXME: The target might use more than one register for the type @@ -5234,213 +5237,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, return 1; } -SmallVector -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { - // This function calculates the register usage by measuring the highest number - // of values that are alive at a single location. Obviously, this is a very - // rough estimation. We scan the loop in a topological order in order and - // assign a number to each instruction. We use RPO to ensure that defs are - // met before their users. We assume that each instruction that has in-loop - // users starts an interval. We record every time that an in-loop value is - // used, so we have a list of the first and last occurrences of each - // instruction. Next, we transpose this data structure into a multi map that - // holds the list of intervals that *end* at a specific location. This multi - // map allows us to perform a linear search. We scan the instructions linearly - // and record each time that a new interval starts, by placing it in a set. - // If we find this value in the multi-map then we remove it from the set. - // The max register usage is the maximum size of the set. - // We also search for instructions that are defined outside the loop, but are - // used inside the loop. We need this number separately from the max-interval - // usage number because when we unroll, loop-invariant values do not take - // more registers. - LoopBlocksDFS DFS(TheLoop); - DFS.perform(LI); - - RegisterUsage RU; - - // Each 'key' in the map opens a new interval. The values - // of the map are the index of the 'last seen' usage of the - // instruction that is the key. - using IntervalMap = SmallDenseMap; - - // Maps instruction to its index. - SmallVector IdxToInstr; - // Marks the end of each interval. - IntervalMap EndPoint; - // Saves the list of instruction indices that are used in the loop. - SmallPtrSet Ends; - // Saves the list of values that are used in the loop but are defined outside - // the loop (not including non-instruction values such as arguments and - // constants). - SmallSetVector LoopInvariants; - - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - for (Instruction &I : BB->instructionsWithoutDebug()) { - IdxToInstr.push_back(&I); - - // Save the end location of each USE. - for (Value *U : I.operands()) { - auto *Instr = dyn_cast(U); - - // Ignore non-instruction values such as arguments, constants, etc. - // FIXME: Might need some motivation why these values are ignored. If - // for example an argument is used inside the loop it will increase the - // register pressure (so shouldn't we add it to LoopInvariants). - if (!Instr) - continue; - - // If this instruction is outside the loop then record it and continue. 
- if (!TheLoop->contains(Instr)) { - LoopInvariants.insert(Instr); - continue; - } - - // Overwrite previous end points. - EndPoint[Instr] = IdxToInstr.size(); - Ends.insert(Instr); - } - } - } - - // Saves the list of intervals that end with the index in 'key'. - using InstrList = SmallVector; - SmallDenseMap TransposeEnds; - - // Transpose the EndPoints to a list of values that end at each index. - for (auto &Interval : EndPoint) - TransposeEnds[Interval.second].push_back(Interval.first); - - SmallPtrSet OpenIntervals; - SmallVector RUs(VFs.size()); - SmallVector, 8> MaxUsages(VFs.size()); - - LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - - const auto &TTICapture = TTI; - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { - if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || - (VF.isScalable() && - !TTICapture.isElementTypeLegalForScalableVector(Ty))) - return 0; - return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); - }; - - collectInLoopReductions(); - - for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) { - Instruction *I = IdxToInstr[Idx]; - - // Remove all of the instructions that end at this location. - InstrList &List = TransposeEnds[Idx]; - for (Instruction *ToRemove : List) - OpenIntervals.erase(ToRemove); - - // Ignore instructions that are never used within the loop and do not have - // side-effects. - if (!Ends.count(I) && !I->mayHaveSideEffects()) - continue; - - // Skip ignored values. - if (ValuesToIgnore.count(I)) - continue; - - // For each VF find the maximum usage of registers. - for (unsigned J = 0, E = VFs.size(); J < E; ++J) { - // Count the number of registers used, per register class, given all open - // intervals. - // Note that elements in this SmallMapVector will be default constructed - // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if - // there is no previous entry for ClassID. - SmallMapVector RegUsage; - - if (VFs[J].isScalar()) { - for (auto *Inst : OpenIntervals) { - unsigned ClassID = - TTI.getRegisterClassForType(false, Inst->getType()); - // FIXME: The target might use more than one register for the type - // even in the scalar case. - RegUsage[ClassID] += 1; - } - } else { - collectNonVectorizedAndSetWideningDecisions(VFs[J]); - for (auto *Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.count(Inst)) - continue; - if (isScalarAfterVectorization(Inst, VFs[J])) { - unsigned ClassID = - TTI.getRegisterClassForType(false, Inst->getType()); - // FIXME: The target might use more than one register for the type - // even in the scalar case. - RegUsage[ClassID] += 1; - } else { - unsigned ClassID = - TTI.getRegisterClassForType(true, Inst->getType()); - RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]); - } - } - } - - for (const auto &Pair : RegUsage) { - auto &Entry = MaxUsages[J][Pair.first]; - Entry = std::max(Entry, Pair.second); - } - } - - LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " - << OpenIntervals.size() << '\n'); - - // Add the current instruction to the list of open intervals. - OpenIntervals.insert(I); - } - - for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { - // Note that elements in this SmallMapVector will be default constructed - // as 0. So we can use "Invariant[ClassID] += n" in the code below even if - // there is no previous entry for ClassID. 
- SmallMapVector Invariant; - - for (auto *Inst : LoopInvariants) { - // FIXME: The target might use more than one register for the type - // even in the scalar case. - bool IsScalar = all_of(Inst->users(), [&](User *U) { - auto *I = cast(U); - return TheLoop != LI->getLoopFor(I->getParent()) || - isScalarAfterVectorization(I, VFs[Idx]); - }); - - ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx]; - unsigned ClassID = - TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); - Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); - } - - LLVM_DEBUG({ - dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; - dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() - << " item\n"; - for (const auto &pair : MaxUsages[Idx]) { - dbgs() << "LV(REG): RegisterClass: " - << TTI.getRegisterClassName(pair.first) << ", " << pair.second - << " registers\n"; - } - dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() - << " item\n"; - for (const auto &pair : Invariant) { - dbgs() << "LV(REG): RegisterClass: " - << TTI.getRegisterClassName(pair.first) << ", " << pair.second - << " registers\n"; - } - }); - - RU.LoopInvariantRegs = Invariant; - RU.MaxLocalUsers = MaxUsages[Idx]; - RUs[Idx] = RU; - } - - return RUs; -} - bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF) { // TODO: Cost model for emulated masked load/store is completely @@ -7621,7 +7417,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { } for (auto &P : VPlans) { - for (ElementCount VF : P->vectorFactors()) { + ArrayRef VFs(P->vectorFactors().begin(), + P->vectorFactors().end()); + auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore); + for (auto [VF, RU] : zip_equal(VFs, RUs)) { if (VF.isScalar()) continue; if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { @@ -7642,6 +7441,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { InstructionCost Cost = cost(*P, VF); VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); + + if (RU.exceedsMaxNumRegs(TTI)) { + LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " + << VF << " because it uses too many registers\n"); + continue; + } + if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) BestFactor = CurrentFactor; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 1b22523e9f5bd..52dcba69d036a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -3116,8 +3116,8 @@ for.exit: ; preds = %for.body ret i32 %add } -define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 %n) #1 { -; CHECK-INTERLEAVE1-LABEL: define dso_local void @not_dotp_high_register_pressure( +define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 %n) #1 { +; CHECK-INTERLEAVE1-LABEL: define dso_local void @dotp_high_register_pressure( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR1]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[CMP100:%.*]] = icmp sgt i32 [[N]], 0 @@ -3139,10 +3139,10 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4 ; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX67_PROMOTED:%.*]] = 
load i32, ptr [[ARRAYIDX67]], align 4 ; CHECK-INTERLEAVE1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX67_PROMOTED]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX58_PROMOTED]], i32 0 @@ -3165,44 +3165,44 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP12]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <4 x i32> [[TMP14]], [[VEC_PHI7]] -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext <4 x i8> [[STRIDED_VEC8]] to <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add <4 x i32> [[TMP17]], [[VEC_PHI6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC9]] to <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = add <4 x 
i32> [[TMP20]], [[VEC_PHI5]] -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sext <4 x i8> [[STRIDED_VEC10]] to <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP22]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24]] = add <4 x i32> [[TMP23]], [[VEC_PHI4]] -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC11]] to <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = add <4 x i32> [[TMP26]], [[VEC_PHI3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <4 x i8> [[STRIDED_VEC12]] to <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add <4 x i32> [[TMP29]], [[VEC_PHI2]] -; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC13]] to <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = add <4 x i32> [[TMP32]], [[VEC_PHI1]] -; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = sext <4 x i8> [[STRIDED_VEC14]] to <4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = add <4 x i32> [[TMP35]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP31]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP24]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP34]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: @@ -3218,7 +3218,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: ; -; CHECK-INTERLEAVED-LABEL: define dso_local void @not_dotp_high_register_pressure( +; CHECK-INTERLEAVED-LABEL: define dso_local void @dotp_high_register_pressure( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR1]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[CMP100:%.*]] = icmp sgt i32 [[N]], 0 @@ -3240,10 +3240,10 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4 ; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX67_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX67]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX67_PROMOTED]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX58_PROMOTED]], 
i32 0 @@ -3256,70 +3256,70 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE21:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE20:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE19:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE18:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP12]], align 1 -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC10:%.*]] 
= shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add <4 x i32> [[TMP14]], [[VEC_PHI7]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <4 x i8> [[STRIDED_VEC8]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP18]] = add <4 x i32> [[TMP17]], [[VEC_PHI6]] -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC9]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add <4 x i32> [[TMP20]], [[VEC_PHI5]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <4 x i8> [[STRIDED_VEC10]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP22]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <4 x i32> [[TMP23]], [[VEC_PHI4]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC11]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP27]] = add <4 x i32> [[TMP26]], [[VEC_PHI3]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <4 x i8> [[STRIDED_VEC12]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP30]] = add <4 x i32> [[TMP29]], [[VEC_PHI2]] -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC13]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <4 x i32> [[TMP32]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <4 x i8> [[STRIDED_VEC14]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP36]] = add <4 x i32> [[TMP35]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], 
<128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE15]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP18]]) +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE18]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE19]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP24]]) +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE20]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]]) -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]]) -; CHECK-INTERLEAVED-NEXT: 
[[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]]) -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]]) -; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]]) -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]]) -; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE19]]) +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE18]]) +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE17]]) +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE16]]) +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: ; -; CHECK-MAXBW-LABEL: define dso_local void @not_dotp_high_register_pressure( +; CHECK-MAXBW-LABEL: define dso_local void @dotp_high_register_pressure( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR1]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[CMP100:%.*]] = icmp sgt i32 [[N]], 0 @@ -3341,10 +3341,10 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4 ; CHECK-MAXBW-NEXT: [[ARRAYIDX67_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX67]], align 4 ; CHECK-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX67_PROMOTED]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX58_PROMOTED]], i32 0 @@ -3367,44 +3367,44 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] -; CHECK-MAXBW-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP12]], align 1 -; CHECK-MAXBW-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-MAXBW-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-MAXBW-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-MAXBW-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-MAXBW-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-MAXBW-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-MAXBW-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-MAXBW-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP15]] = add <4 x i32> [[TMP14]], [[VEC_PHI7]] -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <4 x i8> [[STRIDED_VEC8]] to <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP18]] = add <4 x i32> [[TMP17]], [[VEC_PHI6]] -; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC9]] to <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP21]] = add <4 x i32> [[TMP20]], [[VEC_PHI5]] -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sext <4 x i8> [[STRIDED_VEC10]] to <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP22]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP24]] = add <4 x i32> [[TMP23]], [[VEC_PHI4]] -; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC11]] to <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP27]] = add <4 x i32> [[TMP26]], [[VEC_PHI3]] -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <4 x i8> [[STRIDED_VEC12]] to <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP30]] = add <4 x i32> [[TMP29]], [[VEC_PHI2]] -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC13]] to <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP33]] = add <4 x i32> [[TMP32]], [[VEC_PHI1]] -; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = sext <4 x i8> [[STRIDED_VEC14]] to <4 x i32> -; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP10]] -; CHECK-MAXBW-NEXT: [[TMP36]] = add <4 x i32> [[TMP35]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[WIDE_VEC:%.*]] = load <128 x i8>, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC8:%.*]] 
= shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <128 x i8> [[WIDE_VEC]], <128 x i8> poison, <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP15]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP14]]) +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[STRIDED_VEC8]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP18]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP16]]) +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[STRIDED_VEC9]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP21]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP31]]) +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[STRIDED_VEC10]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP19]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP24]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP20]]) +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = sext <16 x i8> [[STRIDED_VEC11]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP32]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP27]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP22]]) +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[STRIDED_VEC12]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP30]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP34]]) +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[STRIDED_VEC13]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP25]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP33]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]]) +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = sext <16 x i8> [[STRIDED_VEC14]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = mul nsw <16 x i32> [[TMP35]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP36]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; CHECK-MAXBW: middle.block: @@ -3419,6 
+3419,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: +; entry: %cmp100 = icmp sgt i32 %n, 0 br i1 %cmp100, label %for.body.lr.ph, label %for.cond.cleanup @@ -3476,7 +3477,7 @@ for.body: ; preds = %for.body.lr.ph, %fo %7 = phi i32 [ %sum.promoted, %for.body.lr.ph ], [ %add.1, %for.body ] %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv %load.a = load i8, ptr %arrayidx, align 1 - %ext.a = zext i8 %load.a to i32 + %ext.a = sext i8 %load.a to i32 %9 = shl nsw i64 %indvars.iv, 3 %gep.b.1 = getelementptr inbounds nuw i8, ptr %b, i64 %9 %load.b.1 = load i8, ptr %gep.b.1, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index 2e6efec297d61..c5b2be33cae85 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -74,8 +74,8 @@ define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 ; CHECK: LV(REG): VF = 16 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 40 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 48 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item entry: %cmp100 = icmp sgt i32 %n, 0 br i1 %cmp100, label %for.body.lr.ph, label %for.cond.cleanup @@ -193,7 +193,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 9 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 24 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll index e796e40a7591e..f4102ff8c402b 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll @@ -200,44 +200,16 @@ for.body: ; preds = %entry, %for.body ; CHECK: LV: Selecting VF: 4 define void @tripcount8(ptr nocapture readonly %in, ptr nocapture %out, ptr nocapture readonly %consts, i32 %n) #0 { entry: - %arrayidx20 = getelementptr inbounds i32, ptr %out, i32 1 - %arrayidx38 = getelementptr inbounds i32, ptr %out, i32 2 - %arrayidx56 = getelementptr inbounds i32, ptr %out, i32 3 - %arrayidx74 = getelementptr inbounds i32, ptr %out, i32 4 - %arrayidx92 = getelementptr inbounds i32, ptr %out, i32 5 - %arrayidx110 = getelementptr inbounds i32, ptr %out, i32 6 - %arrayidx128 = getelementptr inbounds i32, ptr %out, i32 7 %out.promoted = load i32, ptr %out, align 4 - %arrayidx20.promoted = load i32, ptr %arrayidx20, align 4 - %arrayidx38.promoted = load i32, ptr %arrayidx38, align 4 - %arrayidx56.promoted = load i32, ptr %arrayidx56, align 4 - %arrayidx74.promoted = load i32, ptr %arrayidx74, align 4 - %arrayidx92.promoted = load i32, ptr %arrayidx92, align 4 - %arrayidx110.promoted = load i32, ptr %arrayidx110, align 4 - %arrayidx128.promoted = load i32, ptr 
%arrayidx128, align 4 br label %for.body for.cond.cleanup: ; preds = %for.body store i32 %add12, ptr %out, align 4 - store i32 %add30, ptr %arrayidx20, align 4 - store i32 %add48, ptr %arrayidx38, align 4 - store i32 %add66, ptr %arrayidx56, align 4 - store i32 %add84, ptr %arrayidx74, align 4 - store i32 %add102, ptr %arrayidx92, align 4 - store i32 %add120, ptr %arrayidx110, align 4 - store i32 %add138, ptr %arrayidx128, align 4 ret void for.body: ; preds = %entry, %for.body %hop.0236 = phi i32 [ 0, %entry ], [ %add139, %for.body ] %add12220235 = phi i32 [ %out.promoted, %entry ], [ %add12, %for.body ] - %add30221234 = phi i32 [ %arrayidx20.promoted, %entry ], [ %add30, %for.body ] - %add48222233 = phi i32 [ %arrayidx38.promoted, %entry ], [ %add48, %for.body ] - %add66223232 = phi i32 [ %arrayidx56.promoted, %entry ], [ %add66, %for.body ] - %add84224231 = phi i32 [ %arrayidx74.promoted, %entry ], [ %add84, %for.body ] - %add102225230 = phi i32 [ %arrayidx92.promoted, %entry ], [ %add102, %for.body ] - %add120226229 = phi i32 [ %arrayidx110.promoted, %entry ], [ %add120, %for.body ] - %add138227228 = phi i32 [ %arrayidx128.promoted, %entry ], [ %add138, %for.body ] %arrayidx = getelementptr inbounds i16, ptr %in, i32 %hop.0236 %0 = load i16, ptr %arrayidx, align 2 %conv = sext i16 %0 to i32 @@ -255,132 +227,6 @@ for.body: ; preds = %entry, %for.body %conv9 = sext i16 %3 to i32 %mul10 = mul nsw i32 %conv9, %conv6 %add12 = add nsw i32 %mul10, %add - %add13 = or i32 %hop.0236, 2 - %arrayidx14 = getelementptr inbounds i16, ptr %in, i32 %add13 - %4 = load i16, ptr %arrayidx14, align 2 - %conv15 = sext i16 %4 to i32 - %arrayidx17 = getelementptr inbounds i16, ptr %consts, i32 %add13 - %5 = load i16, ptr %arrayidx17, align 2 - %conv18 = sext i16 %5 to i32 - %mul19 = mul nsw i32 %conv18, %conv15 - %add21 = add nsw i32 %mul19, %add30221234 - %add22 = or i32 %hop.0236, 3 - %arrayidx23 = getelementptr inbounds i16, ptr %in, i32 %add22 - %6 = load i16, ptr %arrayidx23, align 2 - %conv24 = sext i16 %6 to i32 - %arrayidx26 = getelementptr inbounds i16, ptr %consts, i32 %add22 - %7 = load i16, ptr %arrayidx26, align 2 - %conv27 = sext i16 %7 to i32 - %mul28 = mul nsw i32 %conv27, %conv24 - %add30 = add nsw i32 %mul28, %add21 - %add31 = or i32 %hop.0236, 4 - %arrayidx32 = getelementptr inbounds i16, ptr %in, i32 %add31 - %8 = load i16, ptr %arrayidx32, align 2 - %conv33 = sext i16 %8 to i32 - %arrayidx35 = getelementptr inbounds i16, ptr %consts, i32 %add31 - %9 = load i16, ptr %arrayidx35, align 2 - %conv36 = sext i16 %9 to i32 - %mul37 = mul nsw i32 %conv36, %conv33 - %add39 = add nsw i32 %mul37, %add48222233 - %add40 = or i32 %hop.0236, 5 - %arrayidx41 = getelementptr inbounds i16, ptr %in, i32 %add40 - %10 = load i16, ptr %arrayidx41, align 2 - %conv42 = sext i16 %10 to i32 - %arrayidx44 = getelementptr inbounds i16, ptr %consts, i32 %add40 - %11 = load i16, ptr %arrayidx44, align 2 - %conv45 = sext i16 %11 to i32 - %mul46 = mul nsw i32 %conv45, %conv42 - %add48 = add nsw i32 %mul46, %add39 - %add49 = or i32 %hop.0236, 6 - %arrayidx50 = getelementptr inbounds i16, ptr %in, i32 %add49 - %12 = load i16, ptr %arrayidx50, align 2 - %conv51 = sext i16 %12 to i32 - %arrayidx53 = getelementptr inbounds i16, ptr %consts, i32 %add49 - %13 = load i16, ptr %arrayidx53, align 2 - %conv54 = sext i16 %13 to i32 - %mul55 = mul nsw i32 %conv54, %conv51 - %add57 = add nsw i32 %mul55, %add66223232 - %add58 = or i32 %hop.0236, 7 - %arrayidx59 = getelementptr inbounds i16, ptr %in, i32 %add58 - %14 = load i16, ptr 
%arrayidx59, align 2 - %conv60 = sext i16 %14 to i32 - %arrayidx62 = getelementptr inbounds i16, ptr %consts, i32 %add58 - %15 = load i16, ptr %arrayidx62, align 2 - %conv63 = sext i16 %15 to i32 - %mul64 = mul nsw i32 %conv63, %conv60 - %add66 = add nsw i32 %mul64, %add57 - %add67 = or i32 %hop.0236, 8 - %arrayidx68 = getelementptr inbounds i16, ptr %in, i32 %add67 - %16 = load i16, ptr %arrayidx68, align 2 - %conv69 = sext i16 %16 to i32 - %arrayidx71 = getelementptr inbounds i16, ptr %consts, i32 %add67 - %17 = load i16, ptr %arrayidx71, align 2 - %conv72 = sext i16 %17 to i32 - %mul73 = mul nsw i32 %conv72, %conv69 - %add75 = add nsw i32 %mul73, %add84224231 - %add76 = or i32 %hop.0236, 9 - %arrayidx77 = getelementptr inbounds i16, ptr %in, i32 %add76 - %18 = load i16, ptr %arrayidx77, align 2 - %conv78 = sext i16 %18 to i32 - %arrayidx80 = getelementptr inbounds i16, ptr %consts, i32 %add76 - %19 = load i16, ptr %arrayidx80, align 2 - %conv81 = sext i16 %19 to i32 - %mul82 = mul nsw i32 %conv81, %conv78 - %add84 = add nsw i32 %mul82, %add75 - %add85 = or i32 %hop.0236, 10 - %arrayidx86 = getelementptr inbounds i16, ptr %in, i32 %add85 - %20 = load i16, ptr %arrayidx86, align 2 - %conv87 = sext i16 %20 to i32 - %arrayidx89 = getelementptr inbounds i16, ptr %consts, i32 %add85 - %21 = load i16, ptr %arrayidx89, align 2 - %conv90 = sext i16 %21 to i32 - %mul91 = mul nsw i32 %conv90, %conv87 - %add93 = add nsw i32 %mul91, %add102225230 - %add94 = or i32 %hop.0236, 11 - %arrayidx95 = getelementptr inbounds i16, ptr %in, i32 %add94 - %22 = load i16, ptr %arrayidx95, align 2 - %conv96 = sext i16 %22 to i32 - %arrayidx98 = getelementptr inbounds i16, ptr %consts, i32 %add94 - %23 = load i16, ptr %arrayidx98, align 2 - %conv99 = sext i16 %23 to i32 - %mul100 = mul nsw i32 %conv99, %conv96 - %add102 = add nsw i32 %mul100, %add93 - %add103 = or i32 %hop.0236, 12 - %arrayidx104 = getelementptr inbounds i16, ptr %in, i32 %add103 - %24 = load i16, ptr %arrayidx104, align 2 - %conv105 = sext i16 %24 to i32 - %arrayidx107 = getelementptr inbounds i16, ptr %consts, i32 %add103 - %25 = load i16, ptr %arrayidx107, align 2 - %conv108 = sext i16 %25 to i32 - %mul109 = mul nsw i32 %conv108, %conv105 - %add111 = add nsw i32 %mul109, %add120226229 - %add112 = or i32 %hop.0236, 13 - %arrayidx113 = getelementptr inbounds i16, ptr %in, i32 %add112 - %26 = load i16, ptr %arrayidx113, align 2 - %conv114 = sext i16 %26 to i32 - %arrayidx116 = getelementptr inbounds i16, ptr %consts, i32 %add112 - %27 = load i16, ptr %arrayidx116, align 2 - %conv117 = sext i16 %27 to i32 - %mul118 = mul nsw i32 %conv117, %conv114 - %add120 = add nsw i32 %mul118, %add111 - %add121 = or i32 %hop.0236, 14 - %arrayidx122 = getelementptr inbounds i16, ptr %in, i32 %add121 - %28 = load i16, ptr %arrayidx122, align 2 - %conv123 = sext i16 %28 to i32 - %arrayidx125 = getelementptr inbounds i16, ptr %consts, i32 %add121 - %29 = load i16, ptr %arrayidx125, align 2 - %conv126 = sext i16 %29 to i32 - %mul127 = mul nsw i32 %conv126, %conv123 - %add129 = add nsw i32 %mul127, %add138227228 - %add130 = or i32 %hop.0236, 15 - %arrayidx131 = getelementptr inbounds i16, ptr %in, i32 %add130 - %30 = load i16, ptr %arrayidx131, align 2 - %conv132 = sext i16 %30 to i32 - %arrayidx134 = getelementptr inbounds i16, ptr %consts, i32 %add130 - %31 = load i16, ptr %arrayidx134, align 2 - %conv135 = sext i16 %31 to i32 - %mul136 = mul nsw i32 %conv135, %conv132 - %add138 = add nsw i32 %mul136, %add129 %add139 = add nuw nsw i32 %hop.0236, 16 %cmp = icmp ult 
i32 %hop.0236, 112 br i1 %cmp, label %for.body, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll index ce3b2a9f216f2..18d607f5993a6 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -20,17 +20,17 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP7]]) ; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index d021306b89aab..e835731232a62 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -908,11 +908,11 @@ define i32 @mla_i8_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: 
[[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]] @@ -1525,12 +1525,12 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP2]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) -; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP5]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll index 534f8aff1788d..8609f306140e5 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -46,22 +46,10 @@ define i32 @tp_reduces_vf(ptr nocapture %0, i32 %1, ptr %input) { 7: %indvars.iv = phi i32 [ 1, %.preheader ], [ %indvars.iv.next, %7 ] %8 = add nuw nsw i32 %6, %indvars.iv - %9 = add nsw i32 %8, -320 - %10 = add 
nsw i32 %8, -321 - %11 = getelementptr inbounds i8, ptr %3, i32 %10 - %12 = load i8, ptr %11, align 1 - %13 = sext i8 %12 to i32 - %14 = getelementptr inbounds i8, ptr %3, i32 %9 - %15 = load i8, ptr %14, align 1 - %16 = sext i8 %15 to i32 %17 = add nsw i32 %8, -319 %18 = getelementptr inbounds i8, ptr %3, i32 %17 %19 = load i8, ptr %18, align 1 %20 = sext i8 %19 to i32 - %21 = add nsw i32 %8, -1 - %22 = getelementptr inbounds i8, ptr %3, i32 %21 - %23 = load i8, ptr %22, align 1 - %24 = sext i8 %23 to i32 %25 = getelementptr inbounds i8, ptr %3, i32 %8 %26 = load i8, ptr %25, align 1 %27 = sext i8 %26 to i32 @@ -71,24 +59,16 @@ define i32 @tp_reduces_vf(ptr nocapture %0, i32 %1, ptr %input) { %31 = load i8, ptr %30, align 1 %32 = sext i8 %31 to i32 %33 = add nuw nsw i32 %8, 320 - %34 = add nuw nsw i32 %8, 319 - %35 = getelementptr inbounds i8, ptr %3, i32 %34 - %36 = load i8, ptr %35, align 1 - %37 = sext i8 %36 to i32 %38 = getelementptr inbounds i8, ptr %3, i32 %33 %39 = load i8, ptr %38, align 1 %40 = sext i8 %39 to i32 - %41 = add nuw nsw i32 %8, 321 - %42 = getelementptr inbounds i8, ptr %3, i32 %41 - %43 = load i8, ptr %42, align 1 - %44 = sext i8 %43 to i32 - %reass.add = add nsw i32 %16, %13 + %reass.add = add nsw i32 %20, %20 %reass.add44 = add nsw i32 %reass.add, %20 - %reass.add45 = add nsw i32 %reass.add44, %24 + %reass.add45 = add nsw i32 %reass.add44, %20 %45 = add nsw i32 %reass.add45, %32 - %46 = add nsw i32 %45, %37 + %46 = add nsw i32 %45, %32 %47 = add nsw i32 %46, %40 - %reass.add46 = add nsw i32 %47, %44 + %reass.add46 = add nsw i32 %47, %40 %reass.mul = mul nsw i32 %reass.add46, -28 %48 = add nsw i32 %reass.mul, %28 %49 = lshr i32 %48, 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll index edf89a0fa7d7f..8e90287bac2a2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -163,7 +163,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] ; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit: ; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]] @@ -218,7 +218,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] ; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL-INLOOP: for.cond.cleanup.loopexit: ; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]] @@ -377,7 +377,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; 
IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL-OUTLOOP: middle.block:
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP15]])
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]]
@@ -394,7 +394,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]]
; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL-OUTLOOP: for.end:
; IF-EVL-OUTLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; IF-EVL-OUTLOOP-NEXT: ret i32 [[SMIN_LCSSA]]
@@ -431,7 +431,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-INLOOP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
; IF-EVL-INLOOP: scalar.ph:
@@ -447,7 +447,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP16]], i32 [[RDX]]
; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL-INLOOP: for.end:
; IF-EVL-INLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
; IF-EVL-INLOOP-NEXT: ret i32 [[SMIN_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
index 4e3077cfcab67..8de1beea8e57e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
@@ -3,7 +3,8 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
-; CHECK: LV(REG): Found max usage: 2 item
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
 ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
index 8825065aa5fe8..2005e82e9f27a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
@@ -4,12 +4,14 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
-; ZVFH: LV(REG): Found max usage: 2 item
+; ZVFH: LV(REG): VF = 8
+; ZVFH-NEXT: LV(REG): Found max usage: 2 item
 ; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; ZVFH-NEXT: LV(REG): Found invariant usage: 1 item
 ; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; ZVFHMIN: LV(REG): Found max usage: 2 item
+; ZVFHMIN: LV(REG): VF = 8
+; ZVFHMIN-NEXT: LV(REG): Found max usage: 2 item
 ; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; ZVFHMIN-NEXT: LV(REG): Found invariant usage: 1 item
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
index 9585d0d6d6cfd..15665fbd9e315 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
@@ -22,29 +22,34 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
-; CHECK-SCALAR: LV(REG): Found max usage: 2 item
+; CHECK-SCALAR: LV(REG): VF = 1
+; CHECK-SCALAR-NEXT: LV(REG): Found max usage: 2 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::FPRRC, 2 registers
 ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL1: LV(REG): Found max usage: 2 item
+; CHECK-LMUL1: LV(REG): VF = 2
+; CHECK-LMUL1-NEXT: LV(REG): Found max usage: 2 item
 ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-LMUL1-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL2: LV(REG): Found max usage: 2 item
+; CHECK-LMUL2: LV(REG): VF = 4
+; CHECK-LMUL2-NEXT: LV(REG): Found max usage: 2 item
 ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
+; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-LMUL2-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL4: LV(REG): Found max usage: 2 item
+; CHECK-LMUL4: LV(REG): VF = 8
+; CHECK-LMUL4-NEXT: LV(REG): Found max usage: 2 item
 ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
+; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-LMUL4-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL8: LV(REG): Found max usage: 2 item
+; CHECK-LMUL8: LV(REG): VF = 16
+; CHECK-LMUL8-NEXT: LV(REG): Found max usage: 2 item
 ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers
+; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 ; CHECK-LMUL8-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
@@ -72,19 +77,24 @@ define void @goo(ptr nocapture noundef %a, i32 noundef signext %n) {
 ; CHECK-LABEL: goo
-; CHECK-SCALAR: LV(REG): Found max usage: 1 item
+; CHECK-SCALAR: LV(REG): VF = 1
+; CHECK-SCALAR-NEXT: LV(REG): Found max usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL1: LV(REG): Found max usage: 2 item
-; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
-; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 1 registers
-; CHECK-LMUL2: LV(REG): Found max usage: 2 item
-; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL1: LV(REG): VF = 2
+; CHECK-LMUL1-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; CHECK-LMUL2: LV(REG): VF = 4
+; CHECK-LMUL2-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
-; CHECK-LMUL4: LV(REG): Found max usage: 2 item
-; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL4: LV(REG): VF = 8
+; CHECK-LMUL4-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers
-; CHECK-LMUL8: LV(REG): Found max usage: 2 item
-; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL8: LV(REG): VF = 16
+; CHECK-LMUL8-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 entry:
 %cmp3 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll
index 6a3e2471f393a..dc4e7f4ced60e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll
@@ -18,7 +18,6 @@ define void @test() {
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
 ; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -27,7 +26,6 @@ define void @test() {
 ; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1: LV: Found an estimated cost of 60 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
 ; AVX1: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX1: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an
estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -36,7 +34,6 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -46,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 92 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll index 5a17a00d16d96..7048abaf4b66a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll @@ -102,6 +102,11 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -139,11 +144,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4 ; entry: br label %for.body diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll index 600381d8f8c02..416dda2db8774 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll @@ -34,6 +34,7 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 37 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 76 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -43,7 +44,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 210 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll index 5a145dafc831d..fb223b03aec68 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll @@ -136,6 +136,13 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -180,13 +187,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr 
%in0, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-8.ll index 1e7dde431db25..8a0a120d3eef0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-8.ll @@ -45,14 +45,6 @@ define void @test() { ; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 ; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 -; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -95,14 +87,6 @@ define void @test() { ; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX1: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: 
%v7 = load float, ptr %in7, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -153,6 +137,14 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -203,14 +195,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll index a7d5f0ca881ce..dca8cdf61fb7e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll @@ -68,6 +68,9 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 ; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 
32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -91,9 +94,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-4.ll index e7619f14f63f1..508932c40c507 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-4.ll @@ -29,10 +29,6 @@ define void @test() { ; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 ; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 -; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -55,10 +51,6 @@ define void @test() { ; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX1: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -81,10 +73,10 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load 
double, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll index 243a476f174da..09782733e0812 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll @@ -87,6 +87,16 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-6.ll index 9849062fcb48e..bed79fbc64796 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-6.ll @@ -101,6 +101,18 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 0 for 
VF 8 For instruction: %v3 = load double, ptr %in3, align 8 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll index 8b4ac35bed12e..fd4f1acc270e8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll @@ -115,6 +115,20 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 56 for 
VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -159,13 +173,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-8.ll index 6050edec2900c..a78d82bb5d205 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-8.ll @@ -37,14 +37,6 @@ define void @test() { ; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 ; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8 ; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8 -; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 -; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -79,14 +71,6 @@ define void @test() { ; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX1: LV: Found an 
estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 ; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -129,6 +113,22 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 32 For instruction: %v7 = load double, ptr %in7, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll index 457b00dea4bbe..3e52b1f286f97 
100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=loop-vectorize -debug-only=loop-vectorize -mattr=avx512fp16 %s 2>&1 | FileCheck %s ; REQUIRES: asserts target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" @@ -10,7 +11,7 @@ target triple = "i386-unknown-linux-gnu" define void @stride8(half %k, i32 %width_) { entry: -; CHECK: Found an estimated cost of 148 for VF 32 For instruction: %0 = load half +; CHECK: Cost of 148 for VF 32: INTERLEAVE-GROUP with factor 8 at %0, ir<%arrayidx> %cmp72 = icmp sgt i32 %width_, 0 br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup @@ -98,7 +99,7 @@ for.body: ; preds = %for.body.lr.ph, %fo define void @stride3(half %k, i32 %width_) { entry: -; CHECK: Found an estimated cost of 18 for VF 32 For instruction: %0 = load half +; CHECK: LV: Found an estimated cost of 18 for VF 32 For instruction: %0 = load half %cmp27 = icmp sgt i32 %width_, 0 br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-8.ll index 1d99bcd5bec02..35165f3ca0cdd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-8.ll @@ -54,14 +54,6 @@ define void @test() { ; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 ; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 -; SSE2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 -; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 -; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 -; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 -; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 -; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 -; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 -; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 @@ -112,14 +104,6 @@ define void @test() { ; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 ; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX1: LV: Found an estimated cost of 560 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load 
i16, ptr %in2, align 2
-; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
@@ -236,14 +220,6 @@ define void @test() {
; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX512DQ: LV: Found an estimated cost of 1136 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i16, ptr %in7, align 2
;
; AVX512BW-LABEL: 'test'
; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
@@ -302,14 +278,6 @@ define void @test() {
; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX512BW: LV: Found an estimated cost of 616 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i16, ptr %in7, align 2
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
index 880fb82ebacd7..47629bcae70d4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll
@@ -46,7 +46,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX512: LV: Found an estimated cost of 17 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX512: LV: Found an estimated cost of 71 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll
index 4b35a71b2b40c..1fe4ecb02fa6e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll
@@ -18,7 +18,6 @@ define void @test() {
; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; SSE2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -27,7 +26,6 @@ define void @test() {
; AVX1: LV: Found an estimated cost of 36 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -36,7 +34,6 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -46,7 +43,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX512: LV: Found an estimated cost of 92 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
index cd451d0cb70bc..433c33a106a4d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
@@ -102,6 +102,11 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -139,11 +144,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll
index 0bfb4df3ddfe7..cc1c01df3e63e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll
@@ -34,6 +34,7 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX2: LV: Found an estimated cost of 37 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX2: LV: Found an estimated cost of 76 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -43,7 +44,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX512: LV: Found an estimated cost of 210 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
index 26613caf76faa..47d4b9d7b98f0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
@@ -136,6 +136,13 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -180,13 +187,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-8.ll
index 46f4351dff0bf..7534695df95d4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-8.ll
@@ -45,14 +45,6 @@ define void @test() {
; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
-; SSE2: LV: Found an estimated cost of 240 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -95,14 +87,6 @@ define void @test() {
; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX1: LV: Found an estimated cost of 304 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -153,6 +137,14 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 68 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -203,14 +195,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
index 7c575b6dc8f37..3c91125610d9e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
@@ -68,6 +68,9 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -91,9 +94,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-4.ll
index 4089026b957ad..4bd0ac8251355 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-4.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-4.ll
@@ -29,10 +29,6 @@ define void @test() {
; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -55,10 +51,6 @@ define void @test() {
; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX1: LV: Found an estimated cost of 176 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -81,10 +73,10 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
index c741e473e7739..9386a0f923199 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-7.ll
@@ -115,6 +115,20 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -159,13 +173,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-8.ll
index 68370852ce85a..34d01d050fdbe 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-8.ll
@@ -37,14 +37,6 @@ define void @test() {
; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
-; SSE2: LV: Found an estimated cost of 112 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
-; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -79,14 +71,6 @@ define void @test() {
; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX1: LV: Found an estimated cost of 176 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -129,6 +113,22 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-8.ll
index fb0f86e1db7a6..a11d86ce14ef7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-8.ll
@@ -171,14 +171,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v5, ptr %out5, align 8
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v7, ptr %out7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-8.ll
index 6bf88fce8df80..572f90b41debc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-8.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-8.ll
@@ -171,14 +171,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
index 629ee1d2711ae..faa2aa43d4934 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
@@ -2,7 +2,7 @@
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse4.2 --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=SSE42
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX1
-; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,-fast-gather --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,-fast-gather --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX2,AVX2-NOFAST
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,+fast-gather --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw --debug-only=loop-vectorize --disable-output < %s 2>&1 | FileCheck %s --check-prefixes=AVX512
@@ -44,7 +44,7 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
; AVX2: LV: Found an estimated cost of 9 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
; AVX2: LV: Found an estimated cost of 18 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
-; AVX2: LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
+; AVX2-NOFAST: LV: Found an estimated cost of 36 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
index 784b030bf3ab3..3cf44947ea462 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -12,14 +12,14 @@ define i32 @foo() {
; CHECK-LABEL: foo
; CHECK: LV(REG): VF = 8
; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers
-; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers
-; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
entry:
br label %for.body
@@ -54,14 +54,26 @@ define i32 @goo() {
; CHECK-LABEL: goo
; CHECK: LV(REG): VF = 8
; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers
-; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers
-; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
+;
+; AVX512F-LABEL: goo
+; AVX512F: LV(REG): VF = 8
+; AVX512F-NEXT: LV(REG): Found max usage: 2 item
+; AVX512F-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
+; AVX512F-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 4 registers
+; AVX512F-NEXT: LV(REG): Found invariant usage: 1 item
+; AVX512F: LV(REG): VF = 16
+; AVX512F-NEXT: LV(REG): Found max usage: 2 item
+; AVX512F-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
+; AVX512F-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 4 registers
+; AVX512F-NEXT: LV(REG): Found invariant usage: 1 item
entry:
br label %for.body