@@ -1020,7 +1020,8 @@ class LoopVectorizationCostModel {
1020
1020
// / If interleave count has been specified by metadata it will be returned.
1021
1021
// / Otherwise, the interleave count is computed and returned. VF and LoopCost
1022
1022
// / are the selected vectorization factor and the cost of the selected VF.
1023
- unsigned selectInterleaveCount (ElementCount VF, InstructionCost LoopCost);
1023
+ unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
1024
+ InstructionCost LoopCost);
1024
1025
1025
1026
// / Memory access instruction may be vectorized in more than one way.
1026
1027
// / Form of instruction after vectorization depends on cost.
@@ -4878,8 +4879,232 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4878
4879
}
4879
4880
}
4880
4881
4882
+ // / Estimate the register usage for \p Plan and vectorization factors in \p VFs.
4883
+ // / Returns one RegisterUsage entry per VF in \p VFs, in the same order as \p VFs; register classes and per-type register counts are queried from \p TTI.
4884
+ static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4885
+ calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4886
+ const TargetTransformInfo &TTI) {
4887
+ // This function calculates the register usage by measuring the highest number
4888
+ // of values that are alive at a single location. Obviously, this is a very
4889
+ // rough estimation. We scan the loop in topological order and
4890
+ // assign a number to each recipe. We use RPO to ensure that defs are
4891
+ // met before their users. We assume that each recipe that has in-loop
4892
+ // users starts an interval. We record every time that an in-loop value is
4893
+ // used, so we have a list of the first and last occurrences of each
4894
+ // recipe. Next, we transpose this data structure into a multi-map that
4895
+ // holds the list of intervals that *end* at a specific location. This
4896
+ // multi-map allows us to perform a linear search. We scan the recipes linearly
4897
+ // and record each time that a new interval starts, by placing it in a set.
4898
+ // If we find this value in the multi-map then we remove it from the set.
4899
+ // The max register usage is the maximum size of the set.
4900
+ // We also search for instructions that are defined outside the loop, but are
4901
+ // used inside the loop. We need this number separately from the max-interval
4902
+ // usage number because when we unroll, loop-invariant values do not take
4903
+ // more registers.
4904
+ LoopVectorizationCostModel::RegisterUsage RU;
4905
+
4906
+ // Each 'key' in the map opens a new interval. The values
4907
+ // of the map are the index of the 'last seen' usage of the
4908
+ // recipe that is the key.
4909
+ using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned , 16 >;
4910
+
4911
+ // Maps each index to its recipe (recipes are appended in traversal order).
4912
+ SmallVector<VPRecipeBase *, 64 > IdxToRecipe;
4913
+ // Marks the end of each interval.
4914
+ IntervalMap EndPoint;
4915
+ // Saves the set of recipes that are used in the loop.
4916
+ SmallPtrSet<VPRecipeBase *, 8 > Ends;
4917
+ // Saves the list of values that are used in the loop but are defined outside
4918
+ // the loop (not including non-recipe values such as arguments and
4919
+ // constants). The vector trip count is always recorded as loop-invariant.
4920
+ SmallSetVector<VPValue *, 8 > LoopInvariants;
4921
+ LoopInvariants.insert (&Plan.getVectorTripCount ());
4922
+
4923
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
4924
+ Plan.getVectorLoopRegion ());
4925
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4926
+ if (!VPBB->getParent ())
4927
+ break ;
4928
+ for (VPRecipeBase &R : *VPBB) {
4929
+ IdxToRecipe.push_back (&R);
4930
+
4931
+ // Save the end location of each USE.
4932
+ for (VPValue *U : R.operands ()) {
4933
+ auto *DefR = U->getDefiningRecipe ();
4934
+
4935
+ // Ignore non-recipe values such as arguments, constants, etc.
4936
+ // FIXME: Might need some motivation why these values are ignored. If
4937
+ // for example an argument is used inside the loop it will increase the
4938
+ // register pressure (so shouldn't we add it to LoopInvariants).
4939
+ if (!DefR && (!U->getLiveInIRValue () ||
4940
+ !isa<Instruction>(U->getLiveInIRValue ())))
4941
+ continue ;
4942
+
4943
+ // If this value has no defining recipe (a live-in whose IR value is an Instruction), record it as loop-invariant and continue.
4944
+ if (!DefR) {
4945
+ LoopInvariants.insert (U);
4946
+ continue ;
4947
+ }
4948
+
4949
+ // Overwrite previous end points so the last user wins; R was already pushed above, so the stored end is the index following R.
4950
+ EndPoint[DefR] = IdxToRecipe.size ();
4951
+ Ends.insert (DefR);
4952
+ }
4953
+ }
4954
+ if (VPBB == Plan.getVectorLoopRegion ()->getExiting ()) {
4955
+ // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4956
+ // exiting block, where their increment will get materialized eventually.
4957
+ for (auto &R : Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
4958
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4959
+ EndPoint[&R] = IdxToRecipe.size ();
4960
+ Ends.insert (&R);
4961
+ }
4962
+ }
4963
+ }
4964
+ }
4965
+
4966
+ // Saves the list of intervals that end with the index in 'key'.
4967
+ using RecipeList = SmallVector<VPRecipeBase *, 2 >;
4968
+ SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
4969
+
4970
+ // Transpose the EndPoints to a list of values that end at each index.
4971
+ for (auto &Interval : EndPoint)
4972
+ TransposeEnds[Interval.second ].push_back (Interval.first );
4973
+
4974
+ SmallPtrSet<VPRecipeBase *, 8 > OpenIntervals;
4975
+ SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 > RUs (VFs.size ());
4976
+ SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
4977
+
4978
+ LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
4979
+
4980
+ VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
4981
+
4982
+ const auto &TTICapture = TTI;
4983
+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4984
+ if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
4985
+ (VF.isScalable () &&
4986
+ !TTICapture.isElementTypeLegalForScalableVector (Ty)))
4987
+ return 0 ;
4988
+ return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
4989
+ };
4990
+
4991
+ for (unsigned int Idx = 0 , Sz = IdxToRecipe.size (); Idx < Sz; ++Idx) {
4992
+ VPRecipeBase *R = IdxToRecipe[Idx];
4993
+
4994
+ // Close all of the intervals that end at this location.
4995
+ RecipeList &List = TransposeEnds[Idx];
4996
+ for (VPRecipeBase *ToRemove : List)
4997
+ OpenIntervals.erase (ToRemove);
4998
+
4999
+ // Ignore recipes that are never used within the loop and have no side effects.
5000
+ if (!Ends.count (R) && !R->mayHaveSideEffects ())
5001
+ continue ;
5002
+
5003
+ // For each VF find the maximum usage of registers.
5004
+ for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
5005
+ // Count the number of registers used, per register class, given all open
5006
+ // intervals.
5007
+ // Note that elements in this SmallMapVector will be default constructed
5008
+ // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5009
+ // there is no previous entry for ClassID.
5010
+ SmallMapVector<unsigned , unsigned , 4 > RegUsage;
5011
+
5012
+ if (VFs[J].isScalar ()) {
5013
+ for (auto *Inst : OpenIntervals) {
5014
+ for (VPValue *DefV : Inst->definedValues ()) {
5015
+ unsigned ClassID = TTI.getRegisterClassForType (
5016
+ false , TypeInfo.inferScalarType (DefV));
5017
+ // FIXME: The target might use more than one register for the type
5018
+ // even in the scalar case.
5019
+ RegUsage[ClassID] += 1 ;
5020
+ }
5021
+ }
5022
+ } else {
5023
+ for (auto *R : OpenIntervals) {
5024
+ if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
5025
+ continue ;
5026
+ if (isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5027
+ VPScalarIVStepsRecipe>(R) ||
5028
+ (isa<VPInstruction>(R) &&
5029
+ all_of (cast<VPSingleDefRecipe>(R)->users (), [&](VPUser *U) {
5030
+ return cast<VPRecipeBase>(U)->usesScalars (
5031
+ R->getVPSingleValue ());
5032
+ }))) {
5033
+ unsigned ClassID = TTI.getRegisterClassForType (
5034
+ false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
5035
+ // FIXME: The target might use more than one register for the type
5036
+ // even in the scalar case.
5037
+ RegUsage[ClassID] += 1 ;
5038
+ } else {
5039
+ for (VPValue *DefV : R->definedValues ()) {
5040
+ Type *ScalarTy = TypeInfo.inferScalarType (DefV);
5041
+ unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5042
+ RegUsage[ClassID] += GetRegUsage (ScalarTy, VFs[J]);
5043
+ }
5044
+ }
5045
+ }
5046
+ }
5047
+
5048
+ for (const auto &Pair : RegUsage) {
5049
+ auto &Entry = MaxUsages[J][Pair.first ];
5050
+ Entry = std::max (Entry, Pair.second );
5051
+ }
5052
+ }
5053
+
5054
+ LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
5055
+ << OpenIntervals.size () << ' \n ' );
5056
+
5057
+ // Add the current recipe to the set of open intervals (done after counting, so R is not included in the usage measured at its own index).
5058
+ OpenIntervals.insert (R);
5059
+ }
5060
+
5061
+ for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
5062
+ // Note that elements in this SmallMapVector will be default constructed
5063
+ // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5064
+ // there is no previous entry for ClassID.
5065
+ SmallMapVector<unsigned , unsigned , 4 > Invariant;
5066
+
5067
+ for (auto *In : LoopInvariants) {
5068
+ // FIXME: The target might use more than one register for the type
5069
+ // even in the scalar case.
5070
+ bool IsScalar = all_of (In->users (), [&](VPUser *U) {
5071
+ return cast<VPRecipeBase>(U)->usesScalars (In);
5072
+ });
5073
+
5074
+ ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
5075
+ unsigned ClassID = TTI.getRegisterClassForType (
5076
+ VF.isVector (), TypeInfo.inferScalarType (In));
5077
+ Invariant[ClassID] += GetRegUsage (TypeInfo.inferScalarType (In), VF);
5078
+ }
5079
+
5080
+ LLVM_DEBUG ({
5081
+ dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
5082
+ dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
5083
+ << " item\n " ;
5084
+ for (const auto &pair : MaxUsages[Idx]) {
5085
+ dbgs () << " LV(REG): RegisterClass: "
5086
+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5087
+ << " registers\n " ;
5088
+ }
5089
+ dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
5090
+ << " item\n " ;
5091
+ for (const auto &pair : Invariant) {
5092
+ dbgs () << " LV(REG): RegisterClass: "
5093
+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5094
+ << " registers\n " ;
5095
+ }
5096
+ });
5097
+
5098
+ RU.LoopInvariantRegs = Invariant;
5099
+ RU.MaxLocalUsers = MaxUsages[Idx];
5100
+ RUs[Idx] = RU;
5101
+ }
5102
+
5103
+ return RUs;
5104
+ }
5105
+
4881
5106
unsigned
4882
- LoopVectorizationCostModel::selectInterleaveCount (ElementCount VF,
5107
+ LoopVectorizationCostModel::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4883
5108
InstructionCost LoopCost) {
4884
5109
// -- The interleave heuristics --
4885
5110
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4929,7 +5154,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4929
5154
return 1 ;
4930
5155
}
4931
5156
4932
- RegisterUsage R = calculateRegisterUsage ({VF})[0 ];
5157
+ RegisterUsage R = :: calculateRegisterUsage (Plan, {VF}, TTI )[0 ];
4933
5158
// We divide by these constants so assume that we have at least one
4934
5159
// instruction that uses at least one register.
4935
5160
for (auto &Pair : R.MaxLocalUsers ) {
@@ -10574,7 +10799,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10574
10799
AddBranchWeights, CM.CostKind );
10575
10800
if (LVP.hasPlanWithVF (VF.Width )) {
10576
10801
// Select the interleave count.
10577
- IC = CM.selectInterleaveCount (VF.Width , VF.Cost );
10802
+ IC = CM.selectInterleaveCount (LVP. getPlanFor (VF. Width ), VF.Width , VF.Cost );
10578
10803
10579
10804
unsigned SelectedIC = std::max (IC, UserIC);
10580
10805
// Optimistically generate runtime checks if they are needed. Drop them if
0 commit comments