Skip to content

[CodeLayout] Size-aware machine block placement #109711

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 84 additions & 33 deletions llvm/lib/CodeGen/MachineBlockPlacement.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
"block placement."),
cl::init(UINT_MAX), cl::Hidden);

// Apply the ext-tsp algorithm to minimize the size of the binary. The option
// is off by default.
static cl::opt<bool>
ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
cl::desc("Use ext-tsp for size-aware block placement."));

namespace llvm {
extern cl::opt<bool> EnableExtTspBlockPlacement;
extern cl::opt<bool> ApplyExtTspWithoutProfile;
Expand Down Expand Up @@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
void precomputeTriangleChains();

/// Apply a post-processing step optimizing block placement.
void applyExtTsp();
void applyExtTsp(bool OptForSize);

/// Modify the existing block placement in the function and adjust all jumps.
void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
Expand Down Expand Up @@ -3505,20 +3510,36 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
// Initialize tail duplication thresholds.
initTailDupThreshold();

const bool OptForSize =
MF.getFunction().hasOptSize() ||
llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
// Determine whether to use ext-tsp for perf/size optimization. The method
// is beneficial only for instances with at least 3 basic blocks and it can be
// disabled for huge functions (exceeding a certain size).
bool UseExtTspForPerf = false;
bool UseExtTspForSize = false;
if (3 <= MF.size() && MF.size() <= ExtTspBlockPlacementMaxBlocks) {
UseExtTspForPerf =
EnableExtTspBlockPlacement &&
(ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData());
UseExtTspForSize = OptForSize && ApplyExtTspForSize;
}

// Apply tail duplication.
if (allowTailDupPlacement()) {
MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
bool OptForSize = MF.getFunction().hasOptSize() ||
llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
if (OptForSize)
TailDupSize = 1;
const bool PreRegAlloc = false;
TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
/* LayoutMode */ true, TailDupSize);
precomputeTriangleChains();
if (!UseExtTspForSize)
precomputeTriangleChains();
}

buildCFGChains();
// Run the main block placement.
if (!UseExtTspForSize)
buildCFGChains();

// Changing the layout can create new tail merging opportunities.
// TailMerge can create jump into if branches that make CFG irreducible for
Expand All @@ -3545,14 +3566,14 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
}
}

// Apply a post-processing optimizing block placement.
if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
(ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
MF.size() <= ExtTspBlockPlacementMaxBlocks) {
// Find a new placement and modify the layout of the blocks in the function.
applyExtTsp();

// Re-create CFG chain so that we can optimizeBranches and alignBlocks.
// Apply a post-processing step optimizing block placement:
// - find a new placement and modify the layout of the blocks in the function;
// - re-create CFG chains so that we can optimizeBranches and alignBlocks.
if (UseExtTspForPerf || UseExtTspForSize) {
assert(
!(UseExtTspForPerf && UseExtTspForSize) &&
"UseExtTspForPerf and UseExtTspForSize can not be set simultaneously");
applyExtTsp(/*OptForSize=*/UseExtTspForSize);
createCFGChainExtTsp();
}

Expand All @@ -3577,7 +3598,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
return true;
}

void MachineBlockPlacement::applyExtTsp() {
void MachineBlockPlacement::applyExtTsp(bool OptForSize) {
// Prepare data; blocks are indexed by their index in the current ordering.
DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
BlockIndex.reserve(F->size());
Expand All @@ -3589,13 +3610,15 @@ void MachineBlockPlacement::applyExtTsp() {
CurrentBlockOrder.push_back(&MBB);
}

auto BlockSizes = std::vector<uint64_t>(F->size());
auto BlockCounts = std::vector<uint64_t>(F->size());
std::vector<codelayout::EdgeCount> JumpCounts;
SmallVector<uint64_t, 0> BlockCounts(F->size());
SmallVector<uint64_t, 0> BlockSizes(F->size());
SmallVector<codelayout::EdgeCount, 0> JumpCounts;
SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
SmallVector<const MachineBasicBlock *, 4> Succs;
for (MachineBasicBlock &MBB : *F) {
// Getting the block frequency.
BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency();
// Getting the block size:
// - approximate the size of an instruction by 4 bytes, and
// - ignore debug instructions.
Expand All @@ -3604,23 +3627,49 @@ void MachineBlockPlacement::applyExtTsp() {
// not see a perf improvement with the exact block sizes.
auto NonDbgInsts =
instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
size_t NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;

// Getting jump frequencies.
for (MachineBasicBlock *Succ : MBB.successors()) {
auto EP = MBPI->getEdgeProbability(&MBB, Succ);
BlockFrequency JumpFreq = BlockFreq * EP;
JumpCounts.push_back(
{BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
if (OptForSize) {
Cond.clear();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
continue;

const MachineBasicBlock *FTB = MBB.getFallThrough();
// Succs is a collection of distinct destinations of the block reachable
// from MBB via a jump instruction; initialize the list using the three
// (not necessarily distinct) blocks, FTB, TBB, and FBB.
Succs.clear();
if (TBB && TBB != FTB)
Succs.push_back(TBB);
if (FBB && FBB != FTB)
Succs.push_back(FBB);
if (FTB)
Succs.push_back(FTB);
// Absolute magnitude of non-zero counts does not matter for the
// optimization; slightly prioritize jumps from blocks with a single
// successor, since the corresponding jump instruction will be removed from
// the binary.
const uint64_t Freq = Succs.size() == 1 ? 110 : 100;
for (const MachineBasicBlock *Succ : Succs)
JumpCounts.push_back({BlockIndex[&MBB], BlockIndex[Succ], Freq});
} else {
for (MachineBasicBlock *Succ : MBB.successors()) {
auto EP = MBPI->getEdgeProbability(&MBB, Succ);
BlockFrequency JumpFreq = BlockFreq * EP;
JumpCounts.push_back(
{BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
}
}
}

LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
<< " with profile = " << F->getFunction().hasProfileData()
<< " (" << F->getName().str() << ")"
<< "\n");
LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n",
calcExtTspScore(BlockSizes, JumpCounts)));
<< " (" << F->getName() << ")" << "\n");

const double OrgScore = calcExtTspScore(BlockSizes, JumpCounts);
LLVM_DEBUG(dbgs() << format(" original layout score: %0.2f\n", OrgScore));

// Run the layout algorithm.
auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
Expand All @@ -3629,12 +3678,14 @@ void MachineBlockPlacement::applyExtTsp() {
for (uint64_t Node : NewOrder) {
NewBlockOrder.push_back(CurrentBlockOrder[Node]);
}
LLVM_DEBUG(
dbgs() << format(" optimized layout score: %0.2f\n",
calcExtTspScore(NewOrder, BlockSizes, JumpCounts)));
const double OptScore = calcExtTspScore(NewOrder, BlockSizes, JumpCounts);
LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", OptScore));

// Assign new block order.
assignBlockOrder(NewBlockOrder);
// If the optimization is unsuccessful, fall back to the original block order.
if (OptForSize && OrgScore > OptScore)
assignBlockOrder(CurrentBlockOrder);
else
assignBlockOrder(NewBlockOrder);
}

void MachineBlockPlacement::assignBlockOrder(
Expand Down
131 changes: 131 additions & 0 deletions llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true < %s | FileCheck %s -check-prefix=CHECK-PERF
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK-SIZE

define void @func1() minsize {
; Test the layout of a triangle-shaped CFG in which the direct b0->b2 edge
; (weight 10000) is far hotter than the path through b1 (weight 10).
;
; +-----+
; | b0  | -+
; +-----+  |
;   |      |
;   | 10   |
;   v      |
; +-----+  |
; | b1  |  | 10000
; +-----+  |
;   |      |
;   | 10   |
;   v      |
; +-----+  |
; | b2  | <+
; +-----+
;
; CHECK-PERF-LABEL: func1:
; CHECK-PERF: %b0
; CHECK-PERF: %b1
; CHECK-PERF: %b2
;
; CHECK-SIZE-LABEL: func1:
; CHECK-SIZE: %b0
; CHECK-SIZE: %b2
; CHECK-SIZE: %b1

b0:
%call = call zeroext i1 @a()
br i1 %call, label %b1, label %b2, !prof !1

b1:
call void @d()
call void @d()
call void @d()
br label %b2

b2:
call void @e()
ret void
}

define void @func_loop() minsize !prof !9 {
; Test that the algorithm can rotate loops in the presence of profile data.
;
;                  +--------+
;                  | entry  |
;                  +--------+
;                    |
;                    | 1
;                    v
; +--------+  16   +--------+
; | if.then| <---- | header | <+
; +--------+       +--------+  |
;   |                |         |
;   |                | 160     |
;   |                v         |
;   |              +--------+  |
;   |              | if.else|  | 175
;   |              +--------+  |
;   |                |         |
;   |                | 160     |
;   |                v         |
;   |     16       +--------+  |
;   +------------> | if.end | -+
;                  +--------+
;                    |
;                    | 1
;                    v
;                  +--------+
;                  | end    |
;                  +--------+
;
; CHECK-PERF-LABEL: func_loop:
; CHECK-PERF: %entry
; CHECK-PERF: %header
; CHECK-PERF: %if.then
; CHECK-PERF: %if.else
; CHECK-PERF: %if.end
; CHECK-PERF: %end
;
; CHECK-SIZE-LABEL: func_loop:
; CHECK-SIZE: %entry
; CHECK-SIZE: %header
; CHECK-SIZE: %if.else
; CHECK-SIZE: %if.end
; CHECK-SIZE: %if.then
; CHECK-SIZE: %end

entry:
br label %header

header:
call void @e()
%call = call zeroext i1 @a()
br i1 %call, label %if.then, label %if.else, !prof !10

if.then:
call void @f()
br label %if.end

if.else:
call void @g()
br label %if.end

if.end:
call void @h()
%call2 = call zeroext i1 @a()
; Loop latch: branches back to %header or exits to %end. No !prof branch
; weights are attached to this branch.
br i1 %call2, label %header, label %end

end:
ret void
}


declare zeroext i1 @a()
declare void @b()
declare void @c()
declare void @d()
declare void @e()
declare void @g()
declare void @f()
declare void @h()

!1 = !{!"branch_weights", i32 10, i32 10000}
!9 = !{!"function_entry_count", i64 1}
!10 = !{!"branch_weights", i32 16, i32 160}
Loading