@@ -1022,11 +1022,6 @@ class LoopVectorizationCostModel {
1022
1022
SmallMapVector<unsigned , unsigned , 4 > MaxLocalUsers;
1023
1023
};
1024
1024
1025
- /// \return Returns information about the register usages of the loop for the
1026
- /// given vectorization factors.
1027
- SmallVector<RegisterUsage, 8 >
1028
- calculateRegisterUsage (ArrayRef<ElementCount> VFs);
1029
-
1030
1025
/// Collect values we want to ignore in the cost model.
1031
1026
void collectValuesToIgnore ();
1032
1027
@@ -4189,27 +4184,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4189
4184
ComputeScalableMaxVF);
4190
4185
MaxVectorElementCountMaxBW = MinVF (MaxVectorElementCountMaxBW, MaxSafeVF);
4191
4186
4192
- // Collect all viable vectorization factors larger than the default MaxVF
4193
- // (i.e. MaxVectorElementCount).
4194
- SmallVector<ElementCount, 8 > VFs;
4187
+ // Set the max VF to the largest viable vectorization factor less than or
4188
+ // equal to the max vector element count.
4195
4189
for (ElementCount VS = MaxVectorElementCount * 2 ;
4196
4190
ElementCount::isKnownLE (VS, MaxVectorElementCountMaxBW); VS *= 2 )
4197
- VFs.push_back (VS);
4198
-
4199
- // For each VF calculate its register usage.
4200
- auto RUs = calculateRegisterUsage (VFs);
4191
+ MaxVF = VS;
4201
4192
4202
- // Select the largest VF which doesn't require more registers than existing
4203
- // ones.
4204
- for (int I = RUs.size () - 1 ; I >= 0 ; --I) {
4205
- const auto &MLU = RUs[I].MaxLocalUsers ;
4206
- if (all_of (MLU, [&](decltype (MLU.front ()) &LU) {
4207
- return LU.second <= TTI.getNumberOfRegisters (LU.first );
4208
- })) {
4209
- MaxVF = VFs[I];
4210
- break ;
4211
- }
4212
- }
4213
4193
if (ElementCount MinVF =
4214
4194
TTI.getMinimumVF (SmallestType, ComputeScalableMaxVF)) {
4215
4195
if (ElementCount::isKnownLT (MaxVF, MinVF)) {
@@ -5392,213 +5372,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
5392
5372
return 1 ;
5393
5373
}
5394
5374
5395
- SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
5396
- LoopVectorizationCostModel::calculateRegisterUsage (ArrayRef<ElementCount> VFs) {
5397
- // Returns one RegisterUsage per element of VFs. This function calculates the register usage by measuring the highest number
5398
- // of values that are alive at a single location. Obviously, this is a very
5399
- // rough estimation. We scan the loop in topological order and
5400
- // assign a number to each instruction. We use RPO to ensure that defs are
5401
- // met before their users. We assume that each instruction that has in-loop
5402
- // users starts an interval. We record every time that an in-loop value is
5403
- // used, keeping only the last such use as the end of the interval of each
5404
- // instruction. Next, we transpose this data structure into a multi map that
5405
- // holds the list of intervals that *end* at a specific location. This multi
5406
- // map allows us to perform a linear search. We scan the instructions linearly
5407
- // and record each time that a new interval starts, by placing it in a set.
5408
- // If we find this value in the multi-map then we remove it from the set.
5409
- // The max register usage is the maximum size of the set.
5410
- // We also search for instructions that are defined outside the loop, but are
5411
- // used inside the loop. We need this number separately from the max-interval
5412
- // usage number because when we unroll, loop-invariant values do not take
5413
- // more registers.
5414
- LoopBlocksDFS DFS (TheLoop);
5415
- DFS.perform (LI);
5416
-
5417
- RegisterUsage RU;
5418
-
5419
- // Each 'key' in the map opens a new interval. The values
5420
- // of the map are the index of the 'last seen' usage of the
5421
- // instruction that is the key.
5422
- using IntervalMap = SmallDenseMap<Instruction *, unsigned , 16 >;
5423
-
5424
- // Maps an index to its instruction, in the order of the RPO walk below.
5425
- SmallVector<Instruction *, 64 > IdxToInstr;
5426
- // Marks the end of each interval: one past the index of the last in-loop user.
5427
- IntervalMap EndPoint;
5428
- // Saves the set of in-loop instructions that are used by an instruction in the loop.
5429
- SmallPtrSet<Instruction *, 8 > Ends;
5430
- // Saves the list of values that are used in the loop but are defined outside
5431
- // the loop (not including non-instruction values such as arguments and
5432
- // constants).
5433
- SmallSetVector<Instruction *, 8 > LoopInvariants;
5434
-
5435
- for (BasicBlock *BB : make_range (DFS.beginRPO (), DFS.endRPO ())) {
5436
- for (Instruction &I : BB->instructionsWithoutDebug ()) {
5437
- IdxToInstr.push_back (&I);
5438
-
5439
- // Save the end location of each USE.
5440
- for (Value *U : I.operands ()) {
5441
- auto *Instr = dyn_cast<Instruction>(U);
5442
-
5443
- // Ignore non-instruction values such as arguments, constants, etc.
5444
- // FIXME: Might need some motivation why these values are ignored. If
5445
- // for example an argument is used inside the loop it will increase the
5446
- // register pressure (so shouldn't we add it to LoopInvariants).
5447
- if (!Instr)
5448
- continue ;
5449
-
5450
- // If this instruction is outside the loop then record it and continue.
5451
- if (!TheLoop->contains (Instr)) {
5452
- LoopInvariants.insert (Instr);
5453
- continue ;
5454
- }
5455
-
5456
- // Overwrite previous end points.
5457
- EndPoint[Instr] = IdxToInstr.size ();
5458
- Ends.insert (Instr);
5459
- }
5460
- }
5461
- }
5462
-
5463
- // Saves the list of intervals that end with the index in 'key'.
5464
- using InstrList = SmallVector<Instruction *, 2 >;
5465
- SmallDenseMap<unsigned , InstrList, 16 > TransposeEnds;
5466
-
5467
- // Transpose the EndPoints to a list of values that end at each index.
5468
- for (auto &Interval : EndPoint)
5469
- TransposeEnds[Interval.second ].push_back (Interval.first );
5470
-
5471
- SmallPtrSet<Instruction *, 8 > OpenIntervals;
5472
- SmallVector<RegisterUsage, 8 > RUs (VFs.size ());
5473
- SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
5474
-
5475
- LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
5476
-
5477
- const auto &TTICapture = TTI;
5478
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5479
- if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
5480
- (VF.isScalable () &&
5481
- !TTICapture.isElementTypeLegalForScalableVector (Ty)))
5482
- return 0 ;
5483
- return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
5484
- };
5485
-
5486
- collectInLoopReductions ();
5487
-
5488
- for (unsigned int Idx = 0 , Sz = IdxToInstr.size (); Idx < Sz; ++Idx) {
5489
- Instruction *I = IdxToInstr[Idx];
5490
-
5491
- // Remove all of the instructions that end at this location.
5492
- InstrList &List = TransposeEnds[Idx];
5493
- for (Instruction *ToRemove : List)
5494
- OpenIntervals.erase (ToRemove);
5495
-
5496
- // Ignore instructions that are never used within the loop and do not have
5497
- // side-effects.
5498
- if (!Ends.count (I) && !I->mayHaveSideEffects ())
5499
- continue ;
5500
-
5501
- // Skip ignored values.
5502
- if (ValuesToIgnore.count (I))
5503
- continue ;
5504
-
5505
- // For each VF find the maximum usage of registers.
5506
- for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
5507
- // Count the number of registers used, per register class, given all open
5508
- // intervals.
5509
- // Note that elements in this SmallMapVector will be default constructed
5510
- // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5511
- // there is no previous entry for ClassID.
5512
- SmallMapVector<unsigned , unsigned , 4 > RegUsage;
5513
-
5514
- if (VFs[J].isScalar ()) {
5515
- for (auto *Inst : OpenIntervals) {
5516
- unsigned ClassID =
5517
- TTI.getRegisterClassForType (false , Inst->getType ());
5518
- // FIXME: The target might use more than one register for the type
5519
- // even in the scalar case.
5520
- RegUsage[ClassID] += 1 ;
5521
- }
5522
- } else {
5523
- collectNonVectorizedAndSetWideningDecisions (VFs[J]);
5524
- for (auto *Inst : OpenIntervals) {
5525
- // Skip ignored values for VF > 1.
5526
- if (VecValuesToIgnore.count (Inst))
5527
- continue ;
5528
- if (isScalarAfterVectorization (Inst, VFs[J])) {
5529
- unsigned ClassID =
5530
- TTI.getRegisterClassForType (false , Inst->getType ());
5531
- // FIXME: The target might use more than one register for the type
5532
- // even in the scalar case.
5533
- RegUsage[ClassID] += 1 ;
5534
- } else {
5535
- unsigned ClassID =
5536
- TTI.getRegisterClassForType (true , Inst->getType ());
5537
- RegUsage[ClassID] += GetRegUsage (Inst->getType (), VFs[J]);
5538
- }
5539
- }
5540
- }
5541
-
5542
- for (const auto &Pair : RegUsage) {
5543
- auto &Entry = MaxUsages[J][Pair.first ];
5544
- Entry = std::max (Entry, Pair.second );
5545
- }
5546
- }
5547
-
5548
- LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
5549
- << OpenIntervals.size () << ' \n ' );
5550
-
5551
- // Add the current instruction to the list of open intervals.
5552
- OpenIntervals.insert (I);
5553
- }
5554
-
5555
- for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
5556
- // Note that elements in this SmallMapVector will be default constructed
5557
- // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5558
- // there is no previous entry for ClassID.
5559
- SmallMapVector<unsigned , unsigned , 4 > Invariant;
5560
-
5561
- for (auto *Inst : LoopInvariants) {
5562
- // FIXME: The target might use more than one register for the type
5563
- // even in the scalar case.
5564
- bool IsScalar = all_of (Inst->users (), [&](User *U) {
5565
- auto *I = cast<Instruction>(U);
5566
- return TheLoop != LI->getLoopFor (I->getParent ()) ||
5567
- isScalarAfterVectorization (I, VFs[Idx]);
5568
- });
5569
-
5570
- ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
5571
- unsigned ClassID =
5572
- TTI.getRegisterClassForType (VF.isVector (), Inst->getType ());
5573
- Invariant[ClassID] += GetRegUsage (Inst->getType (), VF);
5574
- }
5575
-
5576
- LLVM_DEBUG ({
5577
- dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
5578
- dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
5579
- << " item\n " ;
5580
- for (const auto &pair : MaxUsages[Idx]) {
5581
- dbgs () << " LV(REG): RegisterClass: "
5582
- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5583
- << " registers\n " ;
5584
- }
5585
- dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
5586
- << " item\n " ;
5587
- for (const auto &pair : Invariant) {
5588
- dbgs () << " LV(REG): RegisterClass: "
5589
- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5590
- << " registers\n " ;
5591
- }
5592
- });
5593
-
5594
- RU.LoopInvariantRegs = Invariant;
5595
- RU.MaxLocalUsers = MaxUsages[Idx];
5596
- RUs[Idx] = RU;
5597
- }
5598
-
5599
- return RUs;
5600
- }
5601
-
5602
5375
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack (Instruction *I,
5603
5376
ElementCount VF) {
5604
5377
// TODO: Cost model for emulated masked load/store is completely
@@ -7764,7 +7537,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7764
7537
}
7765
7538
7766
7539
for (auto &P : VPlans) {
7767
- for (ElementCount VF : P->vectorFactors ()) {
7540
+ SmallVector<ElementCount, 1 > VFs (P->vectorFactors ());
7541
+ auto RUs = ::calculateRegisterUsage (*P, VFs, TTI, CM.ValuesToIgnore );
7542
+ for (unsigned I = 0 ; I < VFs.size (); I++) {
7543
+ auto VF = VFs[I];
7768
7544
if (VF.isScalar ())
7769
7545
continue ;
7770
7546
if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -7777,12 +7553,23 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7777
7553
7778
7554
InstructionCost Cost = cost (*P, VF);
7779
7555
VectorizationFactor CurrentFactor (VF, Cost, ScalarCost);
7780
- if (isMoreProfitable (CurrentFactor, BestFactor, P->hasScalarTail ()))
7781
- BestFactor = CurrentFactor;
7782
-
7783
7556
// If profitable add it to ProfitableVF list.
7784
7557
if (isMoreProfitable (CurrentFactor, ScalarFactor, P->hasScalarTail ()))
7785
7558
ProfitableVFs.push_back (CurrentFactor);
7559
+
7560
+ // Make sure that the VF doesn't use more than the number of available
7561
+ // registers
7562
+ const auto &MLU = RUs[I].MaxLocalUsers ;
7563
+ if (any_of (MLU, [&](decltype (MLU.front ()) &LU) {
7564
+ return LU.second > TTI.getNumberOfRegisters (LU.first );
7565
+ })) {
7566
+ LLVM_DEBUG (dbgs () << " LV(REG): Ignoring VF " << VF
7567
+ << " as it uses too many registers\n " );
7568
+ continue ;
7569
+ }
7570
+
7571
+ if (isMoreProfitable (CurrentFactor, BestFactor, P->hasScalarTail ()))
7572
+ BestFactor = CurrentFactor;
7786
7573
}
7787
7574
}
7788
7575
@@ -7794,6 +7581,30 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7794
7581
VectorizationFactor LegacyVF = selectVectorizationFactor ();
7795
7582
VPlan &BestPlan = getPlanFor (BestFactor.Width );
7796
7583
7584
+ // VPlan calculates register pressure from the plan, so it can come to
7585
+ // different conclusions than the legacy cost model.
7586
+ bool RegUsageDeterminedVF = false ;
7587
+ if (BestFactor.Width != LegacyVF.Width ) {
7588
+ SmallVector<ElementCount, 1 > LegacyVFs = {LegacyVF.Width };
7589
+ SmallVector<ElementCount, 1 > VFs = {BestFactor.Width };
7590
+
7591
+ auto LegacyRUs =
7592
+ ::calculateRegisterUsage (getPlanFor(LegacyVF.Width), LegacyVFs, TTI, CM.ValuesToIgnore);
7593
+ auto RUs = ::calculateRegisterUsage (BestPlan, VFs, TTI, CM.ValuesToIgnore );
7594
+
7595
+ auto GetMaxUsage = [](
7596
+ SmallMapVector<unsigned , unsigned , 4 > MaxLocalUsers) {
7597
+ unsigned Max = 0 ;
7598
+ for (auto Pair : MaxLocalUsers)
7599
+ if (Pair.second > Max)
7600
+ Max = Pair.second ;
7601
+ return Max;
7602
+ };
7603
+ unsigned MaxLegacyRegUsage = GetMaxUsage (LegacyRUs[0 ].MaxLocalUsers );
7604
+ unsigned MaxRegUsage = GetMaxUsage (RUs[0 ].MaxLocalUsers );
7605
+ RegUsageDeterminedVF = MaxRegUsage <= MaxLegacyRegUsage;
7606
+ }
7607
+
7797
7608
// Pre-compute the cost and use it to check if BestPlan contains any
7798
7609
// simplifications not accounted for in the legacy cost model. If that's the
7799
7610
// case, don't trigger the assertion, as the extra simplifications may cause a
@@ -7805,6 +7616,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7805
7616
// with early exits and plans with additional VPlan simplifications. The
7806
7617
// legacy cost model doesn't properly model costs for such loops.
7807
7618
assert ((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit () ||
7619
+ RegUsageDeterminedVF ||
7808
7620
planContainsAdditionalSimplifications (getPlanFor (BestFactor.Width ),
7809
7621
CostCtx, OrigLoop) ||
7810
7622
planContainsAdditionalSimplifications (getPlanFor (LegacyVF.Width ),
0 commit comments