@@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
218
218
" block placement." ),
219
219
cl::init(UINT_MAX), cl::Hidden);
220
220
221
+ // Apply the ext-tsp algorithm to minimize the size of a binary; off by default and honored only for functions optimized for size.
222
+ static cl::opt<bool >
223
+ ApplyExtTspForSize (" apply-ext-tsp-for-size" , cl::init(false ), cl::Hidden,
224
+ cl::desc(" Use ext-tsp for size-aware block placement." ));
225
+
221
226
namespace llvm {
222
227
extern cl::opt<bool > EnableExtTspBlockPlacement;
223
228
extern cl::opt<bool > ApplyExtTspWithoutProfile;
@@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
595
600
void precomputeTriangleChains ();
596
601
597
602
/// Apply a post-processing step optimizing block placement.
598
- void applyExtTsp ();
603
+ void applyExtTsp (bool OptForSize );
599
604
600
605
/// Modify the existing block placement in the function and adjust all jumps.
601
606
void assignBlockOrder (const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -3505,20 +3510,36 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3505
3510
// Initialize tail duplication thresholds.
3506
3511
initTailDupThreshold ();
3507
3512
3513
+ const bool OptForSize =
3514
+ MF.getFunction ().hasOptSize () ||
3515
+ llvm::shouldOptimizeForSize (&MF, PSI, &MBFI->getMBFI ());
3516
+ // Determine whether to use ext-tsp for perf/size optimization. The method
3517
+ // is beneficial only for instances with at least 3 basic blocks and it can be
3518
+ // disabled for huge functions (exceeding a certain size).
3519
+ bool UseExtTspForPerf = false ;
3520
+ bool UseExtTspForSize = false ;
3521
+ if (3 <= MF.size () && MF.size () <= ExtTspBlockPlacementMaxBlocks) {
3522
+ UseExtTspForPerf =
3523
+ EnableExtTspBlockPlacement &&
3524
+ (ApplyExtTspWithoutProfile || MF.getFunction ().hasProfileData ());
3525
+ UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3526
+ }
3527
+
3508
3528
// Apply tail duplication.
3509
3529
if (allowTailDupPlacement ()) {
3510
3530
MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree ();
3511
- bool OptForSize = MF.getFunction ().hasOptSize () ||
3512
- llvm::shouldOptimizeForSize (&MF, PSI, &MBFI->getMBFI ());
3513
3531
if (OptForSize)
3514
3532
TailDupSize = 1 ;
3515
3533
const bool PreRegAlloc = false ;
3516
3534
TailDup.initMF (MF, PreRegAlloc, MBPI, MBFI.get (), PSI,
3517
3535
/* LayoutMode */ true , TailDupSize);
3518
- precomputeTriangleChains ();
3536
+ if (!UseExtTspForSize)
3537
+ precomputeTriangleChains ();
3519
3538
}
3520
3539
3521
- buildCFGChains ();
3540
+ // Run the main block placement.
3541
+ if (!UseExtTspForSize)
3542
+ buildCFGChains ();
3522
3543
3523
3544
// Changing the layout can create new tail merging opportunities.
3524
3545
// TailMerge can create jump into if branches that make CFG irreducible for
@@ -3545,14 +3566,14 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3545
3566
}
3546
3567
}
3547
3568
3548
- // Apply a post-processing optimizing block placement.
3549
- if (MF. size () >= 3 && EnableExtTspBlockPlacement &&
3550
- (ApplyExtTspWithoutProfile || MF. getFunction (). hasProfileData ()) &&
3551
- MF. size () <= ExtTspBlockPlacementMaxBlocks ) {
3552
- // Find a new placement and modify the layout of the blocks in the function.
3553
- applyExtTsp ();
3554
-
3555
- // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3569
+ // Apply a post-processing step that optimizes block placement:
3570
+ // - find a new placement and modify the layout of the blocks in the function;
3571
+ // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3572
+ if (UseExtTspForPerf || UseExtTspForSize ) {
3573
+ assert (
3574
+ !(UseExtTspForPerf && UseExtTspForSize) &&
3575
+ " UseExtTspForPerf and UseExtTspForSize can not be set simultaneosly " );
3576
+ applyExtTsp ( /* OptForSize= */ UseExtTspForSize);
3556
3577
createCFGChainExtTsp ();
3557
3578
}
3558
3579
@@ -3577,7 +3598,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3577
3598
return true ;
3578
3599
}
3579
3600
3580
- void MachineBlockPlacement::applyExtTsp () {
3601
+ void MachineBlockPlacement::applyExtTsp (bool OptForSize ) {
3581
3602
// Prepare data; blocks are indexed by their index in the current ordering.
3582
3603
DenseMap<const MachineBasicBlock *, uint64_t > BlockIndex;
3583
3604
BlockIndex.reserve (F->size ());
@@ -3589,13 +3610,15 @@ void MachineBlockPlacement::applyExtTsp() {
3589
3610
CurrentBlockOrder.push_back (&MBB);
3590
3611
}
3591
3612
3592
- auto BlockSizes = std::vector<uint64_t >(F->size ());
3593
- auto BlockCounts = std::vector<uint64_t >(F->size ());
3594
- std::vector<codelayout::EdgeCount> JumpCounts;
3613
+ SmallVector<uint64_t , 0 > BlockCounts (F->size ());
3614
+ SmallVector<uint64_t , 0 > BlockSizes (F->size ());
3615
+ SmallVector<codelayout::EdgeCount, 0 > JumpCounts;
3616
+ SmallVector<MachineOperand, 4 > Cond; // For analyzeBranch.
3617
+ SmallVector<const MachineBasicBlock *, 4 > Succs;
3595
3618
for (MachineBasicBlock &MBB : *F) {
3596
3619
// Getting the block frequency.
3597
3620
BlockFrequency BlockFreq = MBFI->getBlockFreq (&MBB);
3598
- BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency ();
3621
+ BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency ();
3599
3622
// Getting the block size:
3600
3623
// - approximate the size of an instruction by 4 bytes, and
3601
3624
// - ignore debug instructions.
@@ -3604,23 +3627,49 @@ void MachineBlockPlacement::applyExtTsp() {
3604
3627
// not see a perf improvement with the exact block sizes.
3605
3628
auto NonDbgInsts =
3606
3629
instructionsWithoutDebug (MBB.instr_begin (), MBB.instr_end ());
3607
- int NumInsts = std::distance (NonDbgInsts.begin (), NonDbgInsts.end ());
3630
+ size_t NumInsts = std::distance (NonDbgInsts.begin (), NonDbgInsts.end ());
3608
3631
BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3632
+
3609
3633
// Getting jump frequencies.
3610
- for (MachineBasicBlock *Succ : MBB.successors ()) {
3611
- auto EP = MBPI->getEdgeProbability (&MBB, Succ);
3612
- BlockFrequency JumpFreq = BlockFreq * EP;
3613
- JumpCounts.push_back (
3614
- {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency ()});
3634
+ if (OptForSize) {
3635
+ Cond.clear ();
3636
+ MachineBasicBlock *TBB = nullptr , *FBB = nullptr ; // For analyzeBranch.
3637
+ if (TII->analyzeBranch (MBB, TBB, FBB, Cond))
3638
+ continue ;
3639
+
3640
+ const MachineBasicBlock *FTB = MBB.getFallThrough ();
3641
+ // Succs is a collection of distinct destinations of the block reachable
3642
+ // from MBB via a jump instruction; initialize the list using the three
3643
+ // (not necessarily distinct) blocks, FTB, TBB, and FBB.
3644
+ Succs.clear ();
3645
+ if (TBB && TBB != FTB)
3646
+ Succs.push_back (TBB);
3647
+ if (FBB && FBB != FTB)
3648
+ Succs.push_back (FBB);
3649
+ if (FTB)
3650
+ Succs.push_back (FTB);
3651
+ // Absolute magnitude of non-zero counts does not matter for the
3652
+ // optimization; slightly prioritize jumps with a single successor, since
3653
+ // the corresponding jump instruction will be removed from the binary.
3654
+ const uint64_t Freq = Succs.size () == 1 ? 110 : 100 ;
3655
+ for (const MachineBasicBlock *Succ : Succs)
3656
+ JumpCounts.push_back ({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3657
+ } else {
3658
+ for (MachineBasicBlock *Succ : MBB.successors ()) {
3659
+ auto EP = MBPI->getEdgeProbability (&MBB, Succ);
3660
+ BlockFrequency JumpFreq = BlockFreq * EP;
3661
+ JumpCounts.push_back (
3662
+ {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency ()});
3663
+ }
3615
3664
}
3616
3665
}
3617
3666
3618
3667
LLVM_DEBUG (dbgs () << " Applying ext-tsp layout for |V| = " << F->size ()
3619
3668
<< " with profile = " << F->getFunction ().hasProfileData ()
3620
- << " (" << F->getName (). str () << " )"
3621
- << " \n " );
3622
- LLVM_DEBUG ( dbgs () << format ( " original layout score: %0.2f \n " ,
3623
- calcExtTspScore (BlockSizes, JumpCounts) ));
3669
+ << " (" << F->getName () << " )" << " \n " );
3670
+
3671
+ const double OrgScore = calcExtTspScore (BlockSizes, JumpCounts);
3672
+ LLVM_DEBUG ( dbgs () << format ( " original layout score: %0.2f \n " , OrgScore ));
3624
3673
3625
3674
// Run the layout algorithm.
3626
3675
auto NewOrder = computeExtTspLayout (BlockSizes, BlockCounts, JumpCounts);
@@ -3629,12 +3678,14 @@ void MachineBlockPlacement::applyExtTsp() {
3629
3678
for (uint64_t Node : NewOrder) {
3630
3679
NewBlockOrder.push_back (CurrentBlockOrder[Node]);
3631
3680
}
3632
- LLVM_DEBUG (
3633
- dbgs () << format (" optimized layout score: %0.2f\n " ,
3634
- calcExtTspScore (NewOrder, BlockSizes, JumpCounts)));
3681
+ const double OptScore = calcExtTspScore (NewOrder, BlockSizes, JumpCounts);
3682
+ LLVM_DEBUG (dbgs () << format (" optimized layout score: %0.2f\n " , OptScore));
3635
3683
3636
- // Assign new block order.
3637
- assignBlockOrder (NewBlockOrder);
3684
+ // If the optimization is unsuccessful, fall back to the original block order.
3685
+ if (OptForSize && OrgScore > OptScore)
3686
+ assignBlockOrder (CurrentBlockOrder);
3687
+ else
3688
+ assignBlockOrder (NewBlockOrder);
3638
3689
}
3639
3690
3640
3691
void MachineBlockPlacement::assignBlockOrder (
0 commit comments