@@ -992,7 +992,8 @@ class LoopVectorizationCostModel {
992
992
// / If interleave count has been specified by metadata it will be returned.
993
993
// / Otherwise, the interleave count is computed and returned. VF and LoopCost
994
994
// / are the selected vectorization factor and the cost of the selected VF.
995
- unsigned selectInterleaveCount (ElementCount VF, InstructionCost LoopCost);
995
+ unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
996
+ InstructionCost LoopCost);
996
997
997
998
// / Memory access instruction may be vectorized in more than one way.
998
999
// / Form of instruction after vectorization depends on cost.
@@ -4873,8 +4874,233 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4873
4874
}
4874
4875
}
4875
4876
4877
+ // / Estimate the register usage for \p Plan and vectorization factors in \p VFs
4878
+ // / by calculating the highest number of values that are live at a single
4879
+ // / location as a rough estimate. Returns the register usage for each VF in \p
4880
+ // / VFs.
4881
+ static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4882
+ calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4883
+ const TargetTransformInfo &TTI,
4884
+ const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4885
+ // Each 'key' in the map opens a new interval. The values
4886
+ // of the map are the index of the 'last seen' usage of the
4887
+ // recipe that is the key.
4888
+ using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned , 16 >;
4889
+
4890
+ // Maps indices to recipes.
4891
+ SmallVector<VPRecipeBase *, 64 > Idx2Recipe;
4892
+ // Marks the end of each interval.
4893
+ IntervalMap EndPoint;
4894
+ // Saves the set of recipes that are used in the loop.
4895
+ SmallPtrSet<VPRecipeBase *, 8 > Ends;
4896
+ // Saves the list of values that are used in the loop but are defined outside
4897
+ // the loop (not including non-recipe values such as arguments and
4898
+ // constants).
4899
+ SmallSetVector<VPValue *, 8 > LoopInvariants;
4900
+ LoopInvariants.insert (&Plan.getVectorTripCount ());
4901
+
4902
+ // We scan the loop in topological order and assign a number to
4903
+ // each recipe. We use RPO to ensure that defs are met before their users. We
4904
+ // assume that each recipe that has in-loop users starts an interval. We
4905
+ // record every time that an in-loop value is used, so we have a list of the
4906
+ // first and last occurrences of each recipe.
4907
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
4908
+ Plan.getVectorLoopRegion ());
4909
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4910
+ if (!VPBB->getParent ())
4911
+ break ;
4912
+ for (VPRecipeBase &R : *VPBB) {
4913
+ Idx2Recipe.push_back (&R);
4914
+
4915
+ // Save the end location of each USE.
4916
+ for (VPValue *U : R.operands ()) {
4917
+ auto *DefR = U->getDefiningRecipe ();
4918
+
4919
+ // Ignore non-recipe values such as arguments, constants, etc.
4920
+ // FIXME: Might need some motivation why these values are ignored. If
4921
+ // for example an argument is used inside the loop it will increase the
4922
+ // register pressure (so shouldn't we add it to LoopInvariants).
4923
+ if (!DefR && (!U->getLiveInIRValue () ||
4924
+ !isa<Instruction>(U->getLiveInIRValue ())))
4925
+ continue ;
4926
+
4927
+ // If this value has no defining recipe (it is defined outside the loop) then record it and continue.
4928
+ if (!DefR) {
4929
+ LoopInvariants.insert (U);
4930
+ continue ;
4931
+ }
4932
+
4933
+ // Overwrite previous end points.
4934
+ EndPoint[DefR] = Idx2Recipe.size ();
4935
+ Ends.insert (DefR);
4936
+ }
4937
+ }
4938
+ if (VPBB == Plan.getVectorLoopRegion ()->getExiting ()) {
4939
+ // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4940
+ // exiting block, where their increment will get materialized eventually.
4941
+ for (auto &R : Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
4942
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4943
+ EndPoint[&R] = Idx2Recipe.size ();
4944
+ Ends.insert (&R);
4945
+ }
4946
+ }
4947
+ }
4948
+ }
4949
+
4950
+ // Saves the list of intervals that end with the index in 'key'.
4951
+ using RecipeList = SmallVector<VPRecipeBase *, 2 >;
4952
+ SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
4953
+
4954
+ // Next, we transpose the EndPoints into a multi map that holds the list of
4955
+ // intervals that *end* at a specific location.
4956
+ for (auto &Interval : EndPoint)
4957
+ TransposeEnds[Interval.second ].push_back (Interval.first );
4958
+
4959
+ SmallPtrSet<VPRecipeBase *, 8 > OpenIntervals;
4960
+ SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 > RUs (VFs.size ());
4961
+ SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
4962
+
4963
+ LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
4964
+
4965
+ VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
4966
+
4967
+ const auto &TTICapture = TTI;
4968
+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4969
+ if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
4970
+ (VF.isScalable () &&
4971
+ !TTICapture.isElementTypeLegalForScalableVector (Ty)))
4972
+ return 0 ;
4973
+ return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
4974
+ };
4975
+
4976
+ // We scan the instructions linearly and record each time that a new interval
4977
+ // starts, by placing it in a set. If we find this value in TransposeEnds then
4978
+ // we remove it from the set. The max register usage is the maximum register
4979
+ // usage of the recipes of the set.
4980
+ for (unsigned int Idx = 0 , Sz = Idx2Recipe.size (); Idx < Sz; ++Idx) {
4981
+ VPRecipeBase *R = Idx2Recipe[Idx];
4982
+
4983
+ // Remove all of the recipes that end at this location.
4984
+ RecipeList &List = TransposeEnds[Idx];
4985
+ for (VPRecipeBase *ToRemove : List)
4986
+ OpenIntervals.erase (ToRemove);
4987
+
4988
+ // Ignore recipes that are never used within the loop and do not have side
4989
+ // effects.
4990
+ if (!Ends.count (R) && !R->mayHaveSideEffects ())
4991
+ continue ;
4992
+
4993
+ // Skip recipes for ignored values.
4994
+ // TODO: Should mark recipes for ephemeral values that cannot be removed
4995
+ // explicitly in VPlan.
4996
+ if (isa<VPSingleDefRecipe>(R) &&
4997
+ ValuesToIgnore.contains (
4998
+ cast<VPSingleDefRecipe>(R)->getUnderlyingValue ()))
4999
+ continue ;
5000
+
5001
+ // For each VF find the maximum usage of registers.
5002
+ for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
5003
+ // Count the number of registers used, per register class, given all open
5004
+ // intervals.
5005
+ // Note that elements in this SmallMapVector will be default constructed
5006
+ // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5007
+ // there is no previous entry for ClassID.
5008
+ SmallMapVector<unsigned , unsigned , 4 > RegUsage;
5009
+
5010
+ for (auto *R : OpenIntervals) {
5011
+ // Skip recipes that weren't present in the original loop.
5012
+ // TODO: Remove after removing the legacy
5013
+ // LoopVectorizationCostModel::calculateRegisterUsage
5014
+ if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
5015
+ VPBranchOnMaskRecipe>(R))
5016
+ continue ;
5017
+
5018
+ if (VFs[J].isScalar () ||
5019
+ isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5020
+ VPScalarIVStepsRecipe>(R) ||
5021
+ (isa<VPInstruction>(R) &&
5022
+ all_of (cast<VPSingleDefRecipe>(R)->users (), [&](VPUser *U) {
5023
+ return cast<VPRecipeBase>(U)->usesScalars (R->getVPSingleValue ());
5024
+ }))) {
5025
+ unsigned ClassID = TTI.getRegisterClassForType (
5026
+ false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
5027
+ // FIXME: The target might use more than one register for the type
5028
+ // even in the scalar case.
5029
+ RegUsage[ClassID] += 1 ;
5030
+ } else {
5031
+ for (VPValue *DefV : R->definedValues ()) {
5032
+ Type *ScalarTy = TypeInfo.inferScalarType (DefV);
5033
+ unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5034
+ RegUsage[ClassID] += GetRegUsage (ScalarTy, VFs[J]);
5035
+ }
5036
+ }
5037
+ }
5038
+
5039
+ for (const auto &Pair : RegUsage) {
5040
+ auto &Entry = MaxUsages[J][Pair.first ];
5041
+ Entry = std::max (Entry, Pair.second );
5042
+ }
5043
+ }
5044
+
5045
+ LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
5046
+ << OpenIntervals.size () << ' \n ' );
5047
+
5048
+ // Add the current recipe to the list of open intervals.
5049
+ OpenIntervals.insert (R);
5050
+ }
5051
+
5052
+ // We also search for instructions that are defined outside the loop, but are
5053
+ // used inside the loop. We need this number separately from the max-interval
5054
+ // usage number because when we unroll, loop-invariant values do not take
5055
+ // more registers.
5056
+ LoopVectorizationCostModel::RegisterUsage RU;
5057
+ for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
5058
+ // Note that elements in this SmallMapVector will be default constructed
5059
+ // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5060
+ // there is no previous entry for ClassID.
5061
+ SmallMapVector<unsigned , unsigned , 4 > Invariant;
5062
+
5063
+ for (auto *In : LoopInvariants) {
5064
+ // FIXME: The target might use more than one register for the type
5065
+ // even in the scalar case.
5066
+ bool IsScalar = all_of (In->users (), [&](VPUser *U) {
5067
+ return cast<VPRecipeBase>(U)->usesScalars (In);
5068
+ });
5069
+
5070
+ ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
5071
+ unsigned ClassID = TTI.getRegisterClassForType (
5072
+ VF.isVector (), TypeInfo.inferScalarType (In));
5073
+ Invariant[ClassID] += GetRegUsage (TypeInfo.inferScalarType (In), VF);
5074
+ }
5075
+
5076
+ LLVM_DEBUG ({
5077
+ dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
5078
+ dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
5079
+ << " item\n " ;
5080
+ for (const auto &pair : MaxUsages[Idx]) {
5081
+ dbgs () << " LV(REG): RegisterClass: "
5082
+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5083
+ << " registers\n " ;
5084
+ }
5085
+ dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
5086
+ << " item\n " ;
5087
+ for (const auto &pair : Invariant) {
5088
+ dbgs () << " LV(REG): RegisterClass: "
5089
+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5090
+ << " registers\n " ;
5091
+ }
5092
+ });
5093
+
5094
+ RU.LoopInvariantRegs = Invariant;
5095
+ RU.MaxLocalUsers = MaxUsages[Idx];
5096
+ RUs[Idx] = RU;
5097
+ }
5098
+
5099
+ return RUs;
5100
+ }
5101
+
4876
5102
unsigned
4877
- LoopVectorizationCostModel::selectInterleaveCount (ElementCount VF,
5103
+ LoopVectorizationCostModel::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4878
5104
InstructionCost LoopCost) {
4879
5105
// -- The interleave heuristics --
4880
5106
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4924,7 +5150,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4924
5150
return 1 ;
4925
5151
}
4926
5152
4927
- RegisterUsage R = calculateRegisterUsage ({VF})[0 ];
5153
+ RegisterUsage R =
5154
+ ::calculateRegisterUsage (Plan, {VF}, TTI, ValuesToIgnore)[0];
4928
5155
// We divide by these constants so assume that we have at least one
4929
5156
// instruction that uses at least one register.
4930
5157
for (auto &Pair : R.MaxLocalUsers ) {
@@ -5175,7 +5402,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5175
5402
// We also search for instructions that are defined outside the loop, but are
5176
5403
// used inside the loop. We need this number separately from the max-interval
5177
5404
// usage number because when we unroll, loop-invariant values do not take
5178
- // more register .
5405
+ // more registers .
5179
5406
LoopBlocksDFS DFS (TheLoop);
5180
5407
DFS.perform (LI);
5181
5408
@@ -10755,7 +10982,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10755
10982
AddBranchWeights, CM.CostKind );
10756
10983
if (LVP.hasPlanWithVF (VF.Width )) {
10757
10984
// Select the interleave count.
10758
- IC = CM.selectInterleaveCount (VF.Width , VF.Cost );
10985
+ IC = CM.selectInterleaveCount (LVP. getPlanFor (VF. Width ), VF.Width , VF.Cost );
10759
10986
10760
10987
unsigned SelectedIC = std::max (IC, UserIC);
10761
10988
// Optimistically generate runtime checks if they are needed. Drop them if
0 commit comments