Skip to content

Commit 9016f27

Browse files
authored
[CodeLayout] Size-aware machine block placement (#109711)
This is an implementation of a new "size-aware" machine block placement. The idea is to reorder blocks so that the number of fall-through jumps is maximized. Note that profile data is ignored by the optimization, and it is applied only to functions with hasOptSize()=true. This strategy has two benefits: (i) it eliminates jump instructions, which results in a smaller text size; (ii) it avoids using profile data while reordering blocks, which yields more "uniform" functions, thus helping ICF and the machine outliner/merger. For large (mobile) apps, the size benefits of (i) and (ii) are roughly the same, combined providing up to 0.5% uncompressed and up to 1% compressed size savings on top of the current solution. The optimization is turned off by default.
1 parent 99f527d commit 9016f27

File tree

2 files changed

+215
-33
lines changed

2 files changed

+215
-33
lines changed

llvm/lib/CodeGen/MachineBlockPlacement.cpp

Lines changed: 84 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
218218
"block placement."),
219219
cl::init(UINT_MAX), cl::Hidden);
220220

221+
// Apply the ext-tsp algorithm minimizing the size of a binary.
222+
static cl::opt<bool>
223+
ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
224+
cl::desc("Use ext-tsp for size-aware block placement."));
225+
221226
namespace llvm {
222227
extern cl::opt<bool> EnableExtTspBlockPlacement;
223228
extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
595600
void precomputeTriangleChains();
596601

597602
/// Apply a post-processing step optimizing block placement.
598-
void applyExtTsp();
603+
void applyExtTsp(bool OptForSize);
599604

600605
/// Modify the existing block placement in the function and adjust all jumps.
601606
void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -3505,20 +3510,36 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35053510
// Initialize tail duplication thresholds.
35063511
initTailDupThreshold();
35073512

3513+
const bool OptForSize =
3514+
MF.getFunction().hasOptSize() ||
3515+
llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3516+
// Determine whether to use ext-tsp for perf/size optimization. The method
3517+
// is beneficial only for instances with at least 3 basic blocks and it can be
3518+
// disabled for huge functions (exceeding a certain size).
3519+
bool UseExtTspForPerf = false;
3520+
bool UseExtTspForSize = false;
3521+
if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3522+
UseExtTspForPerf =
3523+
EnableExtTspBlockPlacement &&
3524+
(ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData());
3525+
UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3526+
}
3527+
35083528
// Apply tail duplication.
35093529
if (allowTailDupPlacement()) {
35103530
MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3511-
bool OptForSize = MF.getFunction().hasOptSize() ||
3512-
llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
35133531
if (OptForSize)
35143532
TailDupSize = 1;
35153533
const bool PreRegAlloc = false;
35163534
TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
35173535
/* LayoutMode */ true, TailDupSize);
3518-
precomputeTriangleChains();
3536+
if (!UseExtTspForSize)
3537+
precomputeTriangleChains();
35193538
}
35203539

3521-
buildCFGChains();
3540+
// Run the main block placement.
3541+
if (!UseExtTspForSize)
3542+
buildCFGChains();
35223543

35233544
// Changing the layout can create new tail merging opportunities.
35243545
// TailMerge can create jump into if branches that make CFG irreducible for
@@ -3545,14 +3566,14 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35453566
}
35463567
}
35473568

3548-
// Apply a post-processing optimizing block placement.
3549-
if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
3550-
(ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
3551-
MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3552-
// Find a new placement and modify the layout of the blocks in the function.
3553-
applyExtTsp();
3554-
3555-
// Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3569+
// Apply a post-processing step optimizing block placement:
3570+
// - find a new placement and modify the layout of the blocks in the function;
3571+
// - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3572+
if (UseExtTspForPerf || UseExtTspForSize) {
3573+
assert(
3574+
!(UseExtTspForPerf && UseExtTspForSize) &&
3575+
"UseExtTspForPerf and UseExtTspForSize can not be set simultaneosly");
3576+
applyExtTsp(/*OptForSize=*/UseExtTspForSize);
35563577
createCFGChainExtTsp();
35573578
}
35583579

@@ -3577,7 +3598,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35773598
return true;
35783599
}
35793600

3580-
void MachineBlockPlacement::applyExtTsp() {
3601+
void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
35813602
// Prepare data; blocks are indexed by their index in the current ordering.
35823603
DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
35833604
BlockIndex.reserve(F->size());
@@ -3589,13 +3610,15 @@ void MachineBlockPlacement::applyExtTsp() {
35893610
CurrentBlockOrder.push_back(&MBB);
35903611
}
35913612

3592-
auto BlockSizes = std::vector<uint64_t>(F->size());
3593-
auto BlockCounts = std::vector<uint64_t>(F->size());
3594-
std::vector<codelayout::EdgeCount> JumpCounts;
3613+
SmallVector<uint64_t, 0> BlockCounts(F->size());
3614+
SmallVector<uint64_t, 0> BlockSizes(F->size());
3615+
SmallVector<codelayout::EdgeCount, 0> JumpCounts;
3616+
SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
3617+
SmallVector<const MachineBasicBlock *, 4> Succs;
35953618
for (MachineBasicBlock &MBB : *F) {
35963619
// Getting the block frequency.
35973620
BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3598-
BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
3621+
BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
35993622
// Getting the block size:
36003623
// - approximate the size of an instruction by 4 bytes, and
36013624
// - ignore debug instructions.
@@ -3604,23 +3627,49 @@ void MachineBlockPlacement::applyExtTsp() {
36043627
// not see a perf improvement with the exact block sizes.
36053628
auto NonDbgInsts =
36063629
instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
3607-
int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3630+
size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
36083631
BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3632+
36093633
// Getting jump frequencies.
3610-
for (MachineBasicBlock *Succ : MBB.successors()) {
3611-
auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3612-
BlockFrequency JumpFreq = BlockFreq * EP;
3613-
JumpCounts.push_back(
3614-
{BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3634+
if (OptForSize) {
3635+
Cond.clear();
3636+
MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
3637+
if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3638+
continue;
3639+
3640+
const MachineBasicBlock *FTB = MBB.getFallThrough();
3641+
// Succs is a collection of distinct destinations of the block reachable
3642+
// from MBB via a jump instruction; initialize the list using the three
3643+
// (not necessarily distinct) blocks, FTB, TBB, and FBB.
3644+
Succs.clear();
3645+
if (TBB && TBB != FTB)
3646+
Succs.push_back(TBB);
3647+
if (FBB && FBB != FTB)
3648+
Succs.push_back(FBB);
3649+
if (FTB)
3650+
Succs.push_back(FTB);
3651+
// Absolute magnitude of non-zero counts does not matter for the
3652+
// optimization; slightly prioritize jumps with a single successor, since
3653+
// the corresponding jump instruction will be removed from the binary.
3654+
const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
3655+
for (const MachineBasicBlock *Succ : Succs)
3656+
JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3657+
} else {
3658+
for (MachineBasicBlock *Succ : MBB.successors()) {
3659+
auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3660+
BlockFrequency JumpFreq = BlockFreq * EP;
3661+
JumpCounts.push_back(
3662+
{BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3663+
}
36153664
}
36163665
}
36173666

36183667
LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
36193668
<< " with profile = " << F->getFunction().hasProfileData()
3620-
<< " (" << F->getName().str() << ")"
3621-
<< "\n");
3622-
LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n",
3623-
calcExtTspScore(BlockSizes, JumpCounts)));
3669+
<< " (" << F->getName() << ")" << "\n");
3670+
3671+
const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
3672+
LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
36243673

36253674
// Run the layout algorithm.
36263675
auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
@@ -3629,12 +3678,14 @@ void MachineBlockPlacement::applyExtTsp() {
36293678
for (uint64_t Node : NewOrder) {
36303679
NewBlockOrder.push_back(CurrentBlockOrder[Node]);
36313680
}
3632-
LLVM_DEBUG(
3633-
dbgs() << format(" optimized layout score: %0.2f\n",
3634-
calcExtTspScore(NewOrder, BlockSizes, JumpCounts)));
3681+
const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
3682+
LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
36353683

3636-
// Assign new block order.
3637-
assignBlockOrder(NewBlockOrder);
3684+
// If the optimization is unsuccessful, fall back to the original block order.
3685+
if (OptForSize && OrgScore > OptScore)
3686+
assignBlockOrder(CurrentBlockOrder);
3687+
else
3688+
assignBlockOrder(NewBlockOrder);
36383689
}
36393690

36403691
void MachineBlockPlacement::assignBlockOrder(
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true < %s | FileCheck %s -check-prefix=CHECK-PERF
2+
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK-SIZE
3+
4+
define void @func1() minsize {
5+
;
6+
; +-----+
7+
; | b0 | -+
8+
; +-----+ |
9+
; | |
10+
; | 10 |
11+
; v |
12+
; +-----+ |
13+
; | b1 | | 10000
14+
; +-----+ |
15+
; | |
16+
; | 10 |
17+
; v |
18+
; +-----+ |
19+
; | b2 | <+
20+
; +-----+
21+
;
22+
; CHECK-PERF-LABEL: func1:
23+
; CHECK-PERF: %b0
24+
; CHECK-PERF: %b1
25+
; CHECK-PERF: %b2
26+
;
27+
; CHECK-SIZE-LABEL: func1:
28+
; CHECK-SIZE: %b0
29+
; CHECK-SIZE: %b2
30+
; CHECK-SIZE: %b1
31+
32+
b0:
33+
%call = call zeroext i1 @a()
34+
br i1 %call, label %b1, label %b2, !prof !1
35+
36+
b1:
37+
call void @d()
38+
call void @d()
39+
call void @d()
40+
br label %b2
41+
42+
b2:
43+
call void @e()
44+
ret void
45+
}
46+
47+
define void @func_loop() minsize !prof !9 {
48+
; Test that the algorithm can rotate loops in the presence of profile data.
49+
;
50+
; +--------+
51+
; | entry |
52+
; +--------+
53+
; |
54+
; | 1
55+
; v
56+
; +--------+ 16 +--------+
57+
; | if.then| <---- | header | <+
58+
; +--------+ +--------+ |
59+
; | | |
60+
; | | 160 |
61+
; | v |
62+
; | +--------+ |
63+
; | | if.else| | 175
64+
; | +--------+ |
65+
; | | |
66+
; | | 160 |
67+
; | v |
68+
; | 16 +--------+ |
69+
; +------------> | if.end | -+
70+
; +--------+
71+
; |
72+
; | 1
73+
; v
74+
; +--------+
75+
; | end |
76+
; +--------+
77+
;
78+
; CHECK-PERF-LABEL: func_loop:
79+
; CHECK-PERF: %entry
80+
; CHECK-PERF: %header
81+
; CHECK-PERF: %if.then
82+
; CHECK-PERF: %if.else
83+
; CHECK-PERF: %if.end
84+
; CHECK-PERF: %end
85+
;
86+
; CHECK-SIZE-LABEL: func_loop:
87+
; CHECK-SIZE: %entry
88+
; CHECK-SIZE: %header
89+
; CHECK-SIZE: %if.else
90+
; CHECK-SIZE: %if.end
91+
; CHECK-SIZE: %if.then
92+
; CHECK-SIZE: %end
93+
94+
entry:
95+
br label %header
96+
97+
header:
98+
call void @e()
99+
%call = call zeroext i1 @a()
100+
br i1 %call, label %if.then, label %if.else, !prof !10
101+
102+
if.then:
103+
call void @f()
104+
br label %if.end
105+
106+
if.else:
107+
call void @g()
108+
br label %if.end
109+
110+
if.end:
111+
call void @h()
112+
%call2 = call zeroext i1 @a()
113+
br i1 %call2, label %header, label %end
114+
115+
end:
116+
ret void
117+
}
118+
119+
120+
declare zeroext i1 @a()
121+
declare void @b()
122+
declare void @c()
123+
declare void @d()
124+
declare void @e()
125+
declare void @g()
126+
declare void @f()
127+
declare void @h()
128+
129+
!1 = !{!"branch_weights", i32 10, i32 10000}
130+
!9 = !{!"function_entry_count", i64 1}
131+
!10 = !{!"branch_weights", i32 16, i32 160}

0 commit comments

Comments
 (0)