[AMDGPU] MachineScheduler: schedule execution metric added for the UnclusteredHighRPStage

alex-t · alex-t · commit 6daa983c9dde · 2023-01-05T21:10:56.000+01:00
Since the divergence-driven ISel was fully enabled, we have more VGPRs available. The MachineScheduler, trying to take advantage of that, bumps up the occupancy at the expense of hiding memory access latency, which spoils an initially good schedule. A new metric that reflects the latency-hiding quality of the schedule has been added so that the scheduler can balance occupancy against latency. The metric is based on a latency model that computes the ratio of bubble cycles to working cycles. We then use this ratio to decide whether the higher-occupancy schedule is profitable, as follows: Profit = NewOccupancy/OldOccupancy * OldMetric/NewMetric. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D139710
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -38,6 +38,14 @@ static cl::opt<bool>
                            cl::desc("Disable unclustred high register pressure "
                                     "reduction scheduling stage."),
                            cl::init(false));
+// Bias added to the old metric in the UnclusteredHighRPStage profit formula.
+static cl::opt<unsigned> ScheduleMetricBias(
+    "amdgpu-schedule-metric-bias", cl::Hidden,
+    cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it "
+             "to 100 to chase the occupancy only."),
+    cl::init(10));
+
+const unsigned ScheduleMetrics::ScaleFactor = 100;
 
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
@@ -862,6 +870,7 @@ void GCNSchedStage::checkScheduling() {
   // Check the results of scheduling.
   PressureAfter = DAG.getRealRegPressure(RegionIdx);
   LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
+  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
@@ -925,6 +934,120 @@ void GCNSchedStage::checkScheduling() {
   }
 }
 
+// Returns the earliest cycle (never below CurrCycle) at which SU can issue and
+// records it in ReadyCycles. Only preds with isAssignedRegDep() are considered.
+unsigned GCNSchedStage::computeSUnitReadyCycle(
+    const SUnit &SU, unsigned CurrCycle,
+    DenseMap<unsigned, unsigned> &ReadyCycles, const TargetSchedModel &SM) {
+  unsigned Earliest = CurrCycle;
+  for (const auto &Pred : SU.Preds) {
+    if (!Pred.isAssignedRegDep())
+      continue;
+    MachineInstr *Producer = Pred.getSUnit()->getInstr();
+    unsigned ProducerDone = ReadyCycles[DAG.getSUnit(Producer)->NodeNum] +
+                            SM.computeInstrLatency(Producer);
+    Earliest = std::max(Earliest, ProducerDone);
+  }
+  return ReadyCycles[SU.NodeNum] = Earliest;
+}
+
+#ifndef NDEBUG // Debug-only helpers for dumping the modeled schedule.
+struct EarlierIssuingCycle { // Orders (instruction, cycle) pairs by cycle.
+  bool operator()(std::pair<MachineInstr *, unsigned> A,
+                  std::pair<MachineInstr *, unsigned> B) const {
+    return A.second < B.second;
+  }
+};
+// Prints each instruction with its cycle and flags gaps between neighbours.
+static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
+                                        EarlierIssuingCycle> &ReadyCycles) {
+  if (ReadyCycles.empty())
+    return;
+  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
+  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
+         << " ##################\n# Cycle #\t\t\tInstruction          "
+            "             "
+            "                            \n";
+  unsigned Prev = 1;
+  for (const auto &E : ReadyCycles) {
+    if (E.second > Prev + 1)
+      dbgs() << "****************************** BUBBLE OF " << E.second - Prev
+             << " CYCLES DETECTED ******************************\n\n";
+    dbgs() << "[ " << E.second << " ]  :  " << *E.first << "\n";
+    Prev = E.second;
+  }
+}
+#endif // NDEBUG
+
+// Models in-order issue of InputSchedule, at most one SUnit per cycle, and
+// returns the resulting cycle count together with the bubble cycles.
+ScheduleMetrics
+GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
+#ifndef NDEBUG
+  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> IssueLog;
+#endif
+  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
+  DenseMap<unsigned, unsigned> ReadyCycles;
+  unsigned Length = 0;
+  unsigned Stalls = 0;
+  for (const auto &SU : InputSchedule) {
+    // Any gap between the next free slot (Length) and IssueAt is a bubble.
+    unsigned IssueAt = computeSUnitReadyCycle(SU, Length, ReadyCycles, SM);
+    Stalls += IssueAt - Length;
+#ifndef NDEBUG
+    IssueLog.emplace(SU.getInstr(), IssueAt);
+#endif
+    Length = IssueAt + 1;
+  }
+#ifndef NDEBUG
+  LLVM_DEBUG({
+    unsigned Metric =
+        Stalls ? (Stalls * ScheduleMetrics::ScaleFactor) / Length : 1;
+    printScheduleModel(IssueLog);
+    dbgs() << "\n\t"
+           << "Metric: " << Metric << "\n\n";
+  });
+#endif
+
+  return ScheduleMetrics(Length, Stalls);
+}
+
+// Models in-order issue of the instructions DAG iterates over, at most one
+// per cycle, and returns the resulting cycle count together with the bubble
+// cycles. Instructions for which DAG has no SUnit are skipped.
+ScheduleMetrics
+GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
+#ifndef NDEBUG
+  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> IssueLog;
+#endif
+  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
+  DenseMap<unsigned, unsigned> ReadyCycles;
+  unsigned Length = 0;
+  unsigned Stalls = 0;
+  for (auto &MI : DAG) {
+    SUnit *SU = DAG.getSUnit(&MI);
+    if (!SU)
+      continue;
+    unsigned IssueAt = computeSUnitReadyCycle(*SU, Length, ReadyCycles, SM);
+    Stalls += IssueAt - Length;
+#ifndef NDEBUG
+    IssueLog.emplace(SU->getInstr(), IssueAt);
+#endif
+    Length = IssueAt + 1;
+  }
+#ifndef NDEBUG
+  LLVM_DEBUG({
+    unsigned Metric =
+        Stalls ? (Stalls * ScheduleMetrics::ScaleFactor) / Length : 1;
+    printScheduleModel(IssueLog);
+    dbgs() << "\n\t"
+           << "Metric: " << Metric << "\n\n";
+  });
+#endif
+
+  return ScheduleMetrics(Length, Stalls);
+}
+
 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (WavesAfter < DAG.MinOccupancy)
     return true;
@@ -955,7 +1078,28 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
     return true;
   }
 
-  return false;
+  LLVM_DEBUG(
+      dbgs()
+      << "\n\t      *** In shouldRevertScheduling ***\n"
+      << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
+  ScheduleMetrics MBefore =
+      getScheduleMetrics(DAG.SUnits);
+  LLVM_DEBUG(
+      dbgs()
+      << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
+  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
+  unsigned OldMetric = MBefore.getMetric();
+  unsigned NewMetric = MAfter.getMetric();
+  unsigned WavesBefore =
+      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+  unsigned Profit =
+      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
+       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
+       NewMetric) /
+      ScheduleMetrics::ScaleFactor;
+  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
+                    << MAfter << "Profit: " << Profit << "\n");
+  return Profit < ScheduleMetrics::ScaleFactor;
 }
 
 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -126,6 +126,33 @@ class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
   GCNMaxILPSchedStrategy(const MachineSchedContext *C);
 };
 
+class ScheduleMetrics {
+  unsigned ScheduleLength = 0; // Total cycles of the modeled schedule.
+  unsigned BubbleCycles = 0;   // Cycles in which nothing could issue.
+
+public:
+  ScheduleMetrics() = default;
+  ScheduleMetrics(unsigned L, unsigned BC)
+      : ScheduleLength(L), BubbleCycles(BC) {}
+  unsigned getLength() const { return ScheduleLength; }
+  unsigned getBubbles() const { return BubbleCycles; }
+  // Bubbles relative to length, scaled by ScaleFactor; never 0 (safe divisor).
+  unsigned getMetric() const {
+    if (!ScheduleLength) // Empty schedule: avoid dividing by zero.
+      return 1;
+    unsigned Metric = (BubbleCycles * ScaleFactor) / ScheduleLength;
+    return Metric ? Metric : 1; // Below 1% rounds to 0; report 1 instead.
+  }
+  static const unsigned ScaleFactor;
+};
+
+// Writes the scaled metric and its raw bubble/length inputs to OS itself
+// (not to dbgs()), so the operator works with whatever stream it is given.
+inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
+  return OS << "\n Schedule Metric (scaled by " << ScheduleMetrics::ScaleFactor
+            << " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/"
+            << Sm.getLength() << " ]\n";
+}
+
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
@@ -259,6 +286,13 @@ class GCNSchedStage {
   // Check result of scheduling.
   void checkScheduling();
 
+  // Compute the virtual execution time, in cycles, of the given schedule.
+  ScheduleMetrics getScheduleMetrics(const std::vector<SUnit> &InputSchedule);
+  ScheduleMetrics getScheduleMetrics(const GCNScheduleDAGMILive &DAG);
+  unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
+                                  DenseMap<unsigned, unsigned> &ReadyCycles,
+                                  const TargetSchedModel &SM);
+
   // Returns true if scheduling should be reverted.
   virtual bool shouldRevertScheduling(unsigned WavesAfter);
 
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -6622,10 +6622,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v30, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll