@@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
218
218
" block placement." ),
219
219
cl::init(UINT_MAX), cl::Hidden);
220
220
221
+ // Apply the ext-tsp algorithm to minimize the size of the binary.
222
+ static cl::opt<bool >
223
+ ApplyExtTspForSize (" apply-ext-tsp-for-size" , cl::init(false ), cl::Hidden,
224
+ cl::desc(" Use ext-tsp for size-aware block placement." ));
225
+
221
226
namespace llvm {
222
227
extern cl::opt<bool > EnableExtTspBlockPlacement;
223
228
extern cl::opt<bool > ApplyExtTspWithoutProfile;
@@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
595
600
void precomputeTriangleChains ();
596
601
597
602
/// Apply a post-processing step optimizing block placement.
598
- void applyExtTsp ();
603
+ void applyExtTsp (bool OptForSize );
599
604
600
605
/// Modify the existing block placement in the function and adjust all jumps.
601
606
void assignBlockOrder (const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -3505,20 +3510,29 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3505
3510
// Initialize tail duplication thresholds.
3506
3511
initTailDupThreshold ();
3507
3512
3513
+ const bool OptForSize =
3514
+ MF.getFunction ().hasOptSize () ||
3515
+ llvm::shouldOptimizeForSize (&MF, PSI, &MBFI->getMBFI ());
3516
+ // Using ext-tsp for size optimization is possible only when the function
3517
+ // contains more than two basic blocks.
3518
+ const bool UseExtTspForSize =
3519
+ OptForSize && ApplyExtTspForSize && MF.size () >= 3 ;
3520
+
3508
3521
// Apply tail duplication.
3509
3522
if (allowTailDupPlacement ()) {
3510
3523
MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree ();
3511
- bool OptForSize = MF.getFunction ().hasOptSize () ||
3512
- llvm::shouldOptimizeForSize (&MF, PSI, &MBFI->getMBFI ());
3513
3524
if (OptForSize)
3514
3525
TailDupSize = 1 ;
3515
3526
const bool PreRegAlloc = false ;
3516
3527
TailDup.initMF (MF, PreRegAlloc, MBPI, MBFI.get (), PSI,
3517
3528
/* LayoutMode */ true , TailDupSize);
3518
- precomputeTriangleChains ();
3529
+ if (!UseExtTspForSize)
3530
+ precomputeTriangleChains ();
3519
3531
}
3520
3532
3521
- buildCFGChains ();
3533
+ // Run the main block placement.
3534
+ if (!UseExtTspForSize)
3535
+ buildCFGChains ();
3522
3536
3523
3537
// Changing the layout can create new tail merging opportunities.
3524
3538
// TailMerge can create jump into if branches that make CFG irreducible for
@@ -3545,15 +3559,19 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3545
3559
}
3546
3560
}
3547
3561
3548
- // Apply a post-processing optimizing block placement.
3549
- if (MF.size () >= 3 && EnableExtTspBlockPlacement &&
3550
- (ApplyExtTspWithoutProfile || MF.getFunction ().hasProfileData ()) &&
3551
- MF.size () <= ExtTspBlockPlacementMaxBlocks) {
3552
- // Find a new placement and modify the layout of the blocks in the function.
3553
- applyExtTsp ();
3554
-
3555
- // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3556
- createCFGChainExtTsp ();
3562
+ // Apply a post-processing step that optimizes block placement:
3563
+ // - find a new placement and modify the layout of the blocks in the function;
3564
+ // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3565
+ if (MF.size () >= 3 ) {
3566
+ if (EnableExtTspBlockPlacement &&
3567
+ (ApplyExtTspWithoutProfile || MF.getFunction ().hasProfileData ()) &&
3568
+ MF.size () <= ExtTspBlockPlacementMaxBlocks) {
3569
+ applyExtTsp (false );
3570
+ createCFGChainExtTsp ();
3571
+ } else if (UseExtTspForSize) {
3572
+ applyExtTsp (true );
3573
+ createCFGChainExtTsp ();
3574
+ }
3557
3575
}
3558
3576
3559
3577
optimizeBranches ();
@@ -3577,7 +3595,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
3577
3595
return true ;
3578
3596
}
3579
3597
3580
- void MachineBlockPlacement::applyExtTsp () {
3598
+ void MachineBlockPlacement::applyExtTsp (bool OptForSize ) {
3581
3599
// Prepare data; blocks are indexed by their index in the current ordering.
3582
3600
DenseMap<const MachineBasicBlock *, uint64_t > BlockIndex;
3583
3601
BlockIndex.reserve (F->size ());
@@ -3589,13 +3607,15 @@ void MachineBlockPlacement::applyExtTsp() {
3589
3607
CurrentBlockOrder.push_back (&MBB);
3590
3608
}
3591
3609
3592
- auto BlockSizes = std::vector<uint64_t >(F->size ());
3593
- auto BlockCounts = std::vector<uint64_t >(F->size ());
3610
+ std::vector<uint64_t > BlockCounts (F->size ());
3611
+ std::vector<uint64_t > BlockSizes (F->size ());
3594
3612
std::vector<codelayout::EdgeCount> JumpCounts;
3613
+ SmallVector<MachineOperand, 4 > Cond; // For analyzeBranch.
3614
+ SmallVector<const MachineBasicBlock *, 4 > Succs;
3595
3615
for (MachineBasicBlock &MBB : *F) {
3596
3616
// Getting the block frequency.
3597
3617
BlockFrequency BlockFreq = MBFI->getBlockFreq (&MBB);
3598
- BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency ();
3618
+ BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency ();
3599
3619
// Getting the block size:
3600
3620
// - approximate the size of an instruction by 4 bytes, and
3601
3621
// - ignore debug instructions.
@@ -3604,24 +3624,48 @@ void MachineBlockPlacement::applyExtTsp() {
3604
3624
// not see a perf improvement with the exact block sizes.
3605
3625
auto NonDbgInsts =
3606
3626
instructionsWithoutDebug (MBB.instr_begin (), MBB.instr_end ());
3607
- int NumInsts = std::distance (NonDbgInsts.begin (), NonDbgInsts.end ());
3627
+ size_t NumInsts = std::distance (NonDbgInsts.begin (), NonDbgInsts.end ());
3608
3628
BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3609
3629
// Getting jump frequencies.
3610
- for (MachineBasicBlock *Succ : MBB.successors ()) {
3611
- auto EP = MBPI->getEdgeProbability (&MBB, Succ);
3612
- BlockFrequency JumpFreq = BlockFreq * EP;
3613
- JumpCounts.push_back (
3614
- {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency ()});
3630
+
3631
+ if (!OptForSize) {
3632
+ for (MachineBasicBlock *Succ : MBB.successors ()) {
3633
+ auto EP = MBPI->getEdgeProbability (&MBB, Succ);
3634
+ BlockFrequency JumpFreq = BlockFreq * EP;
3635
+ JumpCounts.push_back (
3636
+ {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency ()});
3637
+ }
3638
+ } else {
3639
+ Cond.clear ();
3640
+ MachineBasicBlock *TBB = nullptr , *FBB = nullptr ; // For analyzeBranch.
3641
+ if (TII->analyzeBranch (MBB, TBB, FBB, Cond))
3642
+ continue ;
3643
+
3644
+ const MachineBasicBlock *FTB = MBB.getFallThrough ();
3645
+
3646
+ Succs.clear ();
3647
+ if (TBB && TBB != FTB)
3648
+ Succs.push_back (TBB);
3649
+ if (FBB && FBB != FTB)
3650
+ Succs.push_back (FBB);
3651
+ if (FTB)
3652
+ Succs.push_back (FTB);
3653
+ // Absolute magnitude of non-zero counts does not matter for the
3654
+ // optimization; slightly prioritize jumps with a single successor, since
3655
+ // the corresponding jump instruction will be removed from the binary.
3656
+ const uint64_t Freq = Succs.size () == 1 ? 110 : 100 ;
3657
+ for (const MachineBasicBlock *Succ : Succs) {
3658
+ JumpCounts.push_back ({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3659
+ }
3615
3660
}
3616
3661
}
3617
3662
3618
3663
LLVM_DEBUG (dbgs () << " Applying ext-tsp layout for |V| = " << F->size ()
3619
3664
<< " with profile = " << F->getFunction ().hasProfileData ()
3620
- << " (" << F->getName ().str () << " )"
3621
- << " \n " );
3622
- LLVM_DEBUG (
3623
- dbgs () << format (" original layout score: %0.2f\n " ,
3624
- calcExtTspScore (BlockSizes, BlockCounts, JumpCounts)));
3665
+ << " (" << F->getName ().str () << " )" << " \n " );
3666
+
3667
+ const double OrgScore = calcExtTspScore (BlockSizes, BlockCounts, JumpCounts);
3668
+ LLVM_DEBUG (dbgs () << format (" original layout score: %0.2f\n " , OrgScore));
3625
3669
3626
3670
// Run the layout algorithm.
3627
3671
auto NewOrder = computeExtTspLayout (BlockSizes, BlockCounts, JumpCounts);
@@ -3630,12 +3674,15 @@ void MachineBlockPlacement::applyExtTsp() {
3630
3674
for (uint64_t Node : NewOrder) {
3631
3675
NewBlockOrder.push_back (CurrentBlockOrder[Node]);
3632
3676
}
3633
- LLVM_DEBUG ( dbgs () << format ( " optimized layout score: %0.2f \n " ,
3634
- calcExtTspScore (NewOrder, BlockSizes, BlockCounts,
3635
- JumpCounts) ));
3677
+ const double OptScore =
3678
+ calcExtTspScore (NewOrder, BlockSizes, BlockCounts, JumpCounts);
3679
+ LLVM_DEBUG ( dbgs () << format ( " optimized layout score: %0.2f \n " , OptScore ));
3636
3680
3637
- // Assign new block order.
3638
- assignBlockOrder (NewBlockOrder);
3681
+ // If the optimization is unsuccessful, fall back to the original block order.
3682
+ if (OptForSize && OrgScore > OptScore)
3683
+ assignBlockOrder (CurrentBlockOrder);
3684
+ else
3685
+ assignBlockOrder (NewBlockOrder);
3639
3686
}
3640
3687
3641
3688
void MachineBlockPlacement::assignBlockOrder (
0 commit comments