Skip to content

Commit 6daa983

Browse files
committed
[AMDGPU] MachineScheduler: schedule execution metric added for the UnclusteredHighRPStage
Since divergence-driven ISel was fully enabled, we have more VGPRs available. The MachineScheduler, trying to take advantage of that, bumps up the occupancy at the cost of hiding memory access latency. This really spoils an initially good schedule. A new metric that reflects the latency-hiding quality of the schedule has been added so that the scheduler can balance occupancy against latency. The metric is based on a latency model which computes the ratio of bubble cycles to working cycles. We then use this ratio to decide whether the higher-occupancy schedule is profitable, as follows: Profit = NewOccupancy/OldOccupancy * OldMetric/NewMetric. Reviewed By: rampitec. Differential Revision: https://reviews.llvm.org/D139710
1 parent ef47a0a commit 6daa983

File tree

4 files changed

+275
-108
lines changed

4 files changed

+275
-108
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 145 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ static cl::opt<bool>
3838
cl::desc("Disable unclustred high register pressure "
3939
"reduction scheduling stage."),
4040
cl::init(false));
41+
static cl::opt<unsigned> ScheduleMetricBias(
42+
"amdgpu-schedule-metric-bias", cl::Hidden,
43+
cl::desc(
44+
"Sets the bias which adds weight to occupancy vs latency. Set it to "
45+
"100 to chase the occupancy only."),
46+
cl::init(10));
47+
48+
const unsigned ScheduleMetrics::ScaleFactor = 100;
4149

4250
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
4351
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
@@ -862,6 +870,7 @@ void GCNSchedStage::checkScheduling() {
862870
// Check the results of scheduling.
863871
PressureAfter = DAG.getRealRegPressure(RegionIdx);
864872
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
873+
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
865874

866875
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
867876
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
@@ -925,6 +934,120 @@ void GCNSchedStage::checkScheduling() {
925934
}
926935
}
927936

937+
unsigned
938+
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
939+
DenseMap<unsigned, unsigned> &ReadyCycles,
940+
const TargetSchedModel &SM) {
941+
unsigned ReadyCycle = CurrCycle;
942+
for (auto &D : SU.Preds) {
943+
if (D.isAssignedRegDep()) {
944+
MachineInstr *DefMI = D.getSUnit()->getInstr();
945+
unsigned Latency = SM.computeInstrLatency(DefMI);
946+
unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
947+
ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
948+
}
949+
}
950+
ReadyCycles[SU.NodeNum] = ReadyCycle;
951+
return ReadyCycle;
952+
}
953+
954+
#ifndef NDEBUG
955+
struct EarlierIssuingCycle {
956+
bool operator()(std::pair<MachineInstr *, unsigned> A,
957+
std::pair<MachineInstr *, unsigned> B) const {
958+
return A.second < B.second;
959+
}
960+
};
961+
962+
static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
963+
EarlierIssuingCycle> &ReadyCycles) {
964+
if (ReadyCycles.empty())
965+
return;
966+
unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
967+
dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
968+
<< " ##################\n# Cycle #\t\t\tInstruction "
969+
" "
970+
" \n";
971+
unsigned IPrev = 1;
972+
for (auto &I : ReadyCycles) {
973+
if (I.second > IPrev + 1)
974+
dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
975+
<< " CYCLES DETECTED ******************************\n\n";
976+
dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
977+
IPrev = I.second;
978+
}
979+
}
980+
#endif
981+
982+
// Evaluates the schedule given as an ordered list of SUnits with a simple
// in-order model: each unit issues at the cycle returned by
// computeSUnitReadyCycle(), and the next unit may issue no earlier than one
// cycle after that. The cycles a unit has to wait past the first free issue
// cycle are accumulated as bubbles.
//
// Returns ScheduleMetrics(total cycles, accumulated bubble cycles). In builds
// with assertions the per-instruction cycles and the resulting metric are
// also printed under LLVM_DEBUG.
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned TotalBubbles = 0;
  unsigned NextFreeCycle = 0;
  for (const SUnit &SU : InputSchedule) {
    const unsigned IssueCycle =
        computeSUnitReadyCycle(SU, NextFreeCycle, ReadyCycles, SM);
    // Everything between the first free cycle and the actual issue is a stall.
    TotalBubbles += IssueCycle - NextFreeCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), IssueCycle));
#endif
    NextFreeCycle = IssueCycle + 1;
  }
#ifndef NDEBUG
  // Unlike ScheduleMetrics::getMetric(), the value printed here is not raised
  // to 1 when the integer quotient is 0 with a non-zero bubble count.
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\tMetric: "
             << (TotalBubbles ? (TotalBubbles * ScheduleMetrics::ScaleFactor) /
                                    NextFreeCycle
                              : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(NextFreeCycle, TotalBubbles);
}
1014+
1015+
// Same in-order evaluation as the std::vector<SUnit> overload, but walks the
// instructions in the order they currently appear in DAG. Instructions for
// which DAG.getSUnit() returns null are ignored.
//
// Returns ScheduleMetrics(total cycles, accumulated bubble cycles). In builds
// with assertions the per-instruction cycles and the resulting metric are
// also printed under LLVM_DEBUG.
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned TotalBubbles = 0;
  unsigned NextFreeCycle = 0;
  for (auto &MI : DAG) {
    if (SUnit *SU = DAG.getSUnit(&MI)) {
      const unsigned IssueCycle =
          computeSUnitReadyCycle(*SU, NextFreeCycle, ReadyCycles, SM);
      // Everything between the first free cycle and the actual issue is a
      // stall.
      TotalBubbles += IssueCycle - NextFreeCycle;
#ifndef NDEBUG
      ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), IssueCycle));
#endif
      NextFreeCycle = IssueCycle + 1;
    }
  }
#ifndef NDEBUG
  // Unlike ScheduleMetrics::getMetric(), the value printed here is not raised
  // to 1 when the integer quotient is 0 with a non-zero bubble count.
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\tMetric: "
             << (TotalBubbles ? (TotalBubbles * ScheduleMetrics::ScaleFactor) /
                                    NextFreeCycle
                              : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(NextFreeCycle, TotalBubbles);
}
1050+
9281051
bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
9291052
if (WavesAfter < DAG.MinOccupancy)
9301053
return true;
@@ -955,7 +1078,28 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
9551078
return true;
9561079
}
9571080

958-
return false;
1081+
LLVM_DEBUG(
1082+
dbgs()
1083+
<< "\n\t *** In shouldRevertScheduling ***\n"
1084+
<< " *********** BEFORE UnclusteredHighRPStage ***********\n");
1085+
ScheduleMetrics MBefore =
1086+
getScheduleMetrics(DAG.SUnits);
1087+
LLVM_DEBUG(
1088+
dbgs()
1089+
<< "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1090+
ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1091+
unsigned OldMetric = MBefore.getMetric();
1092+
unsigned NewMetric = MAfter.getMetric();
1093+
unsigned WavesBefore =
1094+
std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
1095+
unsigned Profit =
1096+
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1097+
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1098+
NewMetric) /
1099+
ScheduleMetrics::ScaleFactor;
1100+
LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1101+
<< MAfter << "Profit: " << Profit << "\n");
1102+
return Profit < ScheduleMetrics::ScaleFactor;
9591103
}
9601104

9611105
bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,33 @@ class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
126126
GCNMaxILPSchedStrategy(const MachineSchedContext *C);
127127
};
128128

129+
// Result of evaluating a schedule with the scheduler's latency model: the
// total schedule length in cycles and how many of those cycles were bubbles
// (stalls). getMetric() condenses both into a single scaled ratio.
class ScheduleMetrics {
  // Zero-initialised so that a default-constructed object has a well-defined
  // state instead of indeterminate values.
  unsigned ScheduleLength = 0;
  unsigned BubbleCycles = 0;

public:
  ScheduleMetrics() = default;
  // L is the schedule length in cycles, BC the number of bubble cycles in it.
  ScheduleMetrics(unsigned L, unsigned BC)
      : ScheduleLength(L), BubbleCycles(BC) {}
  unsigned getLength() const { return ScheduleLength; }
  unsigned getBubbles() const { return BubbleCycles; }
  // Returns BubbleCycles / ScheduleLength scaled by ScaleFactor, but never
  // less than 1 so that callers can safely divide by the result.
  unsigned getMetric() const {
    // An empty (or default-constructed) schedule has no bubbles; report the
    // minimum metric instead of dividing by zero.
    if (ScheduleLength == 0)
      return 1;
    unsigned Metric = (BubbleCycles * ScaleFactor) / ScheduleLength;
    // Metric is zero if the amount of bubbles is less than 1% which is too
    // small. So, return 1.
    return Metric ? Metric : 1;
  }
  // Fixed-point scale of the metric; defined out of line.
  static const unsigned ScaleFactor;
};
147+
148+
// Prints the scaled metric of Sm together with its raw bubble/length counts
// to OS and returns OS so that the call can be chained.
//
// The text goes to the stream that was passed in (not unconditionally to
// dbgs()), so the operator composes correctly with any raw_ostream.
inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
  OS << "\n Schedule Metric (scaled by " << ScheduleMetrics::ScaleFactor
     << " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/"
     << Sm.getLength() << " ]\n";
  return OS;
}
155+
129156
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
130157
friend class GCNSchedStage;
131158
friend class OccInitialScheduleStage;
@@ -259,6 +286,13 @@ class GCNSchedStage {
259286
// Check result of scheduling.
260287
void checkScheduling();
261288

289+
// Compute the metrics (virtual execution length and bubble cycles, in clocks) of the given schedule.
290+
ScheduleMetrics getScheduleMetrics(const std::vector<SUnit> &InputSchedule);
291+
ScheduleMetrics getScheduleMetrics(const GCNScheduleDAGMILive &DAG);
292+
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
293+
DenseMap<unsigned, unsigned> &ReadyCycles,
294+
const TargetSchedModel &SM);
295+
262296
// Returns true if scheduling should be reverted.
263297
virtual bool shouldRevertScheduling(unsigned WavesAfter);
264298

llvm/test/CodeGen/AMDGPU/load-global-i16.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6622,10 +6622,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
66226622
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
66236623
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
66246624
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0
6625-
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
6626-
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
66276625
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0
66286626
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0
6627+
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
6628+
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
66296629
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
66306630
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3
66316631
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)