Skip to content

Commit 6cbc54a

Browse files
author
spupyrev
committed
[CodeLayout] Size-aware machine block placement
1 parent 9830156 commit 6cbc54a

File tree

2 files changed

+212
-34
lines changed

2 files changed

+212
-34
lines changed

llvm/lib/CodeGen/MachineBlockPlacement.cpp

Lines changed: 81 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
218218
"block placement."),
219219
cl::init(UINT_MAX), cl::Hidden);
220220

221+
// Apply the ext-tsp algorithm minimizing the size of a binary.
222+
static cl::opt<bool>
223+
ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
224+
cl::desc("Use ext-tsp for size-aware block placement."));
225+
221226
namespace llvm {
222227
extern cl::opt<bool> EnableExtTspBlockPlacement;
223228
extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
595600
void precomputeTriangleChains();
596601

597602
/// Apply a post-processing step optimizing block placement.
598-
void applyExtTsp();
603+
void applyExtTsp(bool OptForSize);
599604

600605
/// Modify the existing block placement in the function and adjust all jumps.
601606
void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -3505,20 +3510,29 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35053510
// Initialize tail duplication thresholds.
35063511
initTailDupThreshold();
35073512

3513+
const bool OptForSize =
3514+
MF.getFunction().hasOptSize() ||
3515+
llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
3516+
// Using ext-tsp for size optimization is possible only when the function
3517+
// contains more than two basic blocks.
3518+
const bool UseExtTspForSize =
3519+
OptForSize && ApplyExtTspForSize && MF.size() >= 3;
3520+
35083521
// Apply tail duplication.
35093522
if (allowTailDupPlacement()) {
35103523
MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3511-
bool OptForSize = MF.getFunction().hasOptSize() ||
3512-
llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
35133524
if (OptForSize)
35143525
TailDupSize = 1;
35153526
const bool PreRegAlloc = false;
35163527
TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
35173528
/* LayoutMode */ true, TailDupSize);
3518-
precomputeTriangleChains();
3529+
if (!UseExtTspForSize)
3530+
precomputeTriangleChains();
35193531
}
35203532

3521-
buildCFGChains();
3533+
// Run the main block placement.
3534+
if (!UseExtTspForSize)
3535+
buildCFGChains();
35223536

35233537
// Changing the layout can create new tail merging opportunities.
35243538
// TailMerge can create jump into if branches that make CFG irreducible for
@@ -3545,15 +3559,19 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35453559
}
35463560
}
35473561

3548-
// Apply a post-processing optimizing block placement.
3549-
if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
3550-
(ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
3551-
MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3552-
// Find a new placement and modify the layout of the blocks in the function.
3553-
applyExtTsp();
3554-
3555-
// Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3556-
createCFGChainExtTsp();
3562+
// Apply a post-processing step optimizing block placement:
3563+
// - find a new placement and modify the layout of the blocks in the function;
3564+
// - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3565+
if (MF.size() >= 3) {
3566+
if (EnableExtTspBlockPlacement &&
3567+
(ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
3568+
MF.size() <= ExtTspBlockPlacementMaxBlocks) {
3569+
applyExtTsp(false);
3570+
createCFGChainExtTsp();
3571+
} else if (UseExtTspForSize) {
3572+
applyExtTsp(true);
3573+
createCFGChainExtTsp();
3574+
}
35573575
}
35583576

35593577
optimizeBranches();
@@ -3577,7 +3595,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35773595
return true;
35783596
}
35793597

3580-
void MachineBlockPlacement::applyExtTsp() {
3598+
void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
35813599
// Prepare data; blocks are indexed by their index in the current ordering.
35823600
DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
35833601
BlockIndex.reserve(F->size());
@@ -3589,13 +3607,15 @@ void MachineBlockPlacement::applyExtTsp() {
35893607
CurrentBlockOrder.push_back(&MBB);
35903608
}
35913609

3592-
auto BlockSizes = std::vector<uint64_t>(F->size());
3593-
auto BlockCounts = std::vector<uint64_t>(F->size());
3610+
std::vector<uint64_t> BlockCounts(F->size());
3611+
std::vector<uint64_t> BlockSizes(F->size());
35943612
std::vector<codelayout::EdgeCount> JumpCounts;
3613+
SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
3614+
SmallVector<const MachineBasicBlock *, 4> Succs;
35953615
for (MachineBasicBlock &MBB : *F) {
35963616
// Getting the block frequency.
35973617
BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
3598-
BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
3618+
BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
35993619
// Getting the block size:
36003620
// - approximate the size of an instruction by 4 bytes, and
36013621
// - ignore debug instructions.
@@ -3604,24 +3624,48 @@ void MachineBlockPlacement::applyExtTsp() {
36043624
// not see a perf improvement with the exact block sizes.
36053625
auto NonDbgInsts =
36063626
instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
3607-
int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
3627+
size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
36083628
BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
36093629
// Getting jump frequencies.
3610-
for (MachineBasicBlock *Succ : MBB.successors()) {
3611-
auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3612-
BlockFrequency JumpFreq = BlockFreq * EP;
3613-
JumpCounts.push_back(
3614-
{BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3630+
3631+
if (!OptForSize) {
3632+
for (MachineBasicBlock *Succ : MBB.successors()) {
3633+
auto EP = MBPI->getEdgeProbability(&MBB, Succ);
3634+
BlockFrequency JumpFreq = BlockFreq * EP;
3635+
JumpCounts.push_back(
3636+
{BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
3637+
}
3638+
} else {
3639+
Cond.clear();
3640+
MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
3641+
if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
3642+
continue;
3643+
3644+
const MachineBasicBlock *FTB = MBB.getFallThrough();
3645+
3646+
Succs.clear();
3647+
if (TBB && TBB != FTB)
3648+
Succs.push_back(TBB);
3649+
if (FBB && FBB != FTB)
3650+
Succs.push_back(FBB);
3651+
if (FTB)
3652+
Succs.push_back(FTB);
3653+
// Absolute magnitude of non-zero counts does not matter for the
3654+
// optimization; slightly prioritize jumps with a single successor, since
3655+
// the corresponding jump instruction will be removed from the binary.
3656+
const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
3657+
for (const MachineBasicBlock *Succ : Succs) {
3658+
JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3659+
}
36153660
}
36163661
}
36173662

36183663
LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
36193664
<< " with profile = " << F->getFunction().hasProfileData()
3620-
<< " (" << F->getName().str() << ")"
3621-
<< "\n");
3622-
LLVM_DEBUG(
3623-
dbgs() << format(" original layout score: %0.2f\n",
3624-
calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
3665+
<< " (" << F->getName().str() << ")" << "\n");
3666+
3667+
const double OrgScore = calcExtTspScore(BlockSizes, BlockCounts, JumpCounts);
3668+
LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));
36253669

36263670
// Run the layout algorithm.
36273671
auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
@@ -3630,12 +3674,15 @@ void MachineBlockPlacement::applyExtTsp() {
36303674
for (uint64_t Node : NewOrder) {
36313675
NewBlockOrder.push_back(CurrentBlockOrder[Node]);
36323676
}
3633-
LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
3634-
calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
3635-
JumpCounts)));
3677+
const double OptScore =
3678+
calcExtTspScore(NewOrder, BlockSizes, BlockCounts, JumpCounts);
3679+
LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));
36363680

3637-
// Assign new block order.
3638-
assignBlockOrder(NewBlockOrder);
3681+
// If the optimization is unsuccessful, fall back to the original block order.
3682+
if (OptForSize && OrgScore > OptScore)
3683+
assignBlockOrder(CurrentBlockOrder);
3684+
else
3685+
assignBlockOrder(NewBlockOrder);
36393686
}
36403687

36413688
void MachineBlockPlacement::assignBlockOrder(
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true < %s | FileCheck %s
2+
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK2
3+
4+
define void @func1() minsize {
5+
;
6+
; +-----+
7+
; | b0 | -+
8+
; +-----+ |
9+
; | |
10+
; | 10 |
11+
; v |
12+
; +-----+ |
13+
; | b1 | | 10000
14+
; +-----+ |
15+
; | |
16+
; | 10 |
17+
; v |
18+
; +-----+ |
19+
; | b2 | <+
20+
; +-----+
21+
;
22+
; CHECK-LABEL: func1:
23+
; CHECK: %b0
24+
; CHECK: %b1
25+
; CHECK: %b2
26+
;
27+
; CHECK2-LABEL: func1:
28+
; CHECK2: %b0
29+
; CHECK2: %b2
30+
; CHECK2: %b1
31+
32+
b0:
33+
%call = call zeroext i1 @a()
34+
br i1 %call, label %b1, label %b2, !prof !1
35+
36+
b1:
37+
call void @d()
38+
call void @d()
39+
call void @d()
40+
br label %b2
41+
42+
b2:
43+
call void @e()
44+
ret void
45+
}
46+
47+
define void @func_loop() minsize !prof !9 {
48+
; Test that the algorithm can rotate loops in the presence of profile data.
49+
;
50+
; +--------+
51+
; | entry |
52+
; +--------+
53+
; |
54+
; | 1
55+
; v
56+
; +--------+ 16 +--------+
57+
; | if.then| <---- | header | <+
58+
; +--------+ +--------+ |
59+
; | | |
60+
; | | 160 |
61+
; | v |
62+
; | +--------+ |
63+
; | | if.else| | 175
64+
; | +--------+ |
65+
; | | |
66+
; | | 160 |
67+
; | v |
68+
; | 16 +--------+ |
69+
; +------------> | if.end | -+
70+
; +--------+
71+
; |
72+
; | 1
73+
; v
74+
; +--------+
75+
; | end |
76+
; +--------+
77+
;
78+
; CHECK-LABEL: func_loop:
79+
; CHECK: %entry
80+
; CHECK: %header
81+
; CHECK: %if.then
82+
; CHECK: %if.else
83+
; CHECK: %if.end
84+
; CHECK: %end
85+
;
86+
; CHECK2-LABEL: func_loop:
87+
; CHECK2: %entry
88+
; CHECK2: %header
89+
; CHECK2: %if.else
90+
; CHECK2: %if.end
91+
; CHECK2: %if.then
92+
; CHECK2: %end
93+
94+
entry:
95+
br label %header
96+
97+
header:
98+
call void @e()
99+
%call = call zeroext i1 @a()
100+
br i1 %call, label %if.then, label %if.else, !prof !10
101+
102+
if.then:
103+
call void @f()
104+
br label %if.end
105+
106+
if.else:
107+
call void @g()
108+
br label %if.end
109+
110+
if.end:
111+
call void @h()
112+
%call2 = call zeroext i1 @a()
113+
br i1 %call2, label %header, label %end
114+
115+
end:
116+
ret void
117+
}
118+
119+
120+
declare zeroext i1 @a()
121+
declare void @b()
122+
declare void @c()
123+
declare void @d()
124+
declare void @e()
125+
declare void @g()
126+
declare void @f()
127+
declare void @h()
128+
129+
!1 = !{!"branch_weights", i32 10, i32 10000}
130+
!9 = !{!"function_entry_count", i64 1}
131+
!10 = !{!"branch_weights", i32 16, i32 160}

0 commit comments

Comments
 (0)