[AMDGPU] Add DAG mutation to improve scheduling before barriers #142716

Open: wants to merge 4 commits into main
87 changes: 87 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -0,0 +1,87 @@
//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains a DAG scheduling mutation to add latency to
/// barrier edges between ATOMIC_FENCE instructions and preceding
/// memory accesses potentially affected by the fence.
/// This is beneficial when a fence would cause wait count insertion,
/// as more instructions can then be scheduled before the fence, hiding
/// memory latency.
/// It also reduces the risk of a fence causing a premature wait
/// on all active memory operations.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUBarrierLatency.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"

using namespace llvm;

namespace {

class BarrierLatency : public ScheduleDAGMutation {
public:
  BarrierLatency() = default;
  void apply(ScheduleDAGInstrs *DAG) override;
};

// Return true if MI (or, for a bundle, its first bundled instruction) is a
// DS, VMEM, or SMRD load.
static bool isMemLoad(const MachineInstr *MI) {
  auto isLoad = [](const MachineInstr *MI) {
    return (SIInstrInfo::isDS(*MI) || SIInstrInfo::isVMEM(*MI) ||
            SIInstrInfo::isSMRD(*MI)) &&
           MI->mayLoad();
  };

  if (MI->isBundle()) {
    // For bundles, inspect the first instruction inside the bundle.
    auto I = std::next(MI->getIterator());
    return I != MI->getParent()->instr_end() && I->isInsideBundle() &&
           isLoad(&*I);
  }

  return isLoad(MI);
}

void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
  const unsigned SyntheticLatency = 2000;
  for (SUnit &SU : DAG->SUnits) {
    const MachineInstr *MI = SU.getInstr();
    if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
      continue;

    // Update latency on barrier edges of ATOMIC_FENCE.
    // We don't consider the scope of the fence or the type of instruction
    // involved in the barrier edge.
    for (SDep &PredDep : SU.Preds) {
      if (!PredDep.isBarrier())
        continue;
      SUnit *PredSU = PredDep.getSUnit();
      if (!isMemLoad(PredSU->getInstr()))
        continue;
      // Each dependence is recorded twice, as a predecessor edge on this
      // node and as a successor edge on PredSU; bump the latency on both.
      SDep ForwardD = PredDep;
      ForwardD.setSUnit(&SU);
      for (SDep &SuccDep : PredSU->Succs) {
        if (SuccDep == ForwardD) {
          SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency);
          break;
        }
      }
      PredDep.setLatency(PredDep.getLatency() + SyntheticLatency);
      PredSU->setDepthDirty();
      SU.setDepthDirty();
    }
  }
}

} // end anonymous namespace

std::unique_ptr<ScheduleDAGMutation>
llvm::createAMDGPUBarrierLatencyDAGMutation() {
  return std::make_unique<BarrierLatency>();
}
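The mutation above is self-contained: it only rewrites edge latencies on an already-built DAG, so it can be attached to any ScheduleDAGInstrs-based scheduler. Below is a minimal sketch of that wiring; the wrapper function name createBarrierAwareScheduler is hypothetical, while ScheduleDAGMILive, GenericScheduler, and addMutation are existing LLVM machine-scheduler APIs. The actual registrations this patch makes are in the AMDGPUTargetMachine.cpp hunks further down.

// Minimal sketch, not part of this patch: attach the barrier-latency
// mutation to a generic machine scheduler.
#include "AMDGPUBarrierLatency.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

static ScheduleDAGInstrs *
createBarrierAwareScheduler(MachineSchedContext *C) {
  // ScheduleDAGMILive is the register-pressure-aware scheduling DAG.
  ScheduleDAGMILive *DAG =
      new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
  // Mutations run in registration order after DAG construction; this one
  // only increases latencies on existing barrier edges.
  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
  return DAG;
}

The 2000-cycle SyntheticLatency is a scheduling heuristic rather than a hardware figure: it is large enough to dominate real instruction latencies, so the scheduler pulls independent work in between the load and the fence.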
21 changes: 21 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -0,0 +1,21 @@
//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H

#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <memory>

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();

} // namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
@@ -588,6 +589,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
  return DAG;
}

@@ -608,6 +610,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
  return DAG;
}

@@ -1156,6 +1159,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
      EnableVOPD)
    DAG->addMutation(createVOPDPairingMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+  DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
  return DAG;
}
//===----------------------------------------------------------------------===//
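All three GCN scheduler factories above register the mutation unconditionally. If a tunable escape hatch were wanted, the usual LLVM pattern is a hidden command-line flag; the sketch below is hypothetical and not part of this patch, and the flag name amdgpu-barrier-latency is invented for illustration.

// Hypothetical opt-out knob, not in this patch: guard the mutation behind a
// hidden cl::opt so the heuristic can be toggled when bisecting regressions.
#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool> EnableBarrierLatency(
    "amdgpu-barrier-latency", llvm::cl::Hidden, llvm::cl::init(true),
    llvm::cl::desc("Add synthetic latency on fence barrier edges"));

// ...then at each registration site:
//   if (EnableBarrierLatency)
//     DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());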
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPUAsmPrinter.cpp
  AMDGPUAtomicOptimizer.cpp
  AMDGPUAttributor.cpp
+  AMDGPUBarrierLatency.cpp
  AMDGPUCallLowering.cpp
  AMDGPUCodeGenPrepare.cpp
  AMDGPUCombinerHelper.cpp
50 changes: 24 additions & 26 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -1528,9 +1528,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1576,9 +1576,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1603,9 +1603,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1630,9 +1630,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1683,10 +1683,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX942-NEXT: buffer_wbl2 sc1
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX942-NEXT: v_mov_b32_e32 v1, v4
+; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1730,10 +1730,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1756,10 +1756,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX908-NEXT: v_mov_b32_e32 v4, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1782,10 +1782,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX8-NEXT: v_mov_b32_e32 v4, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1830,9 +1830,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
@@ -1872,11 +1872,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_cbranch_execnz .LBB14_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1925,9 +1924,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX908-NEXT: v_mov_b32_e32 v3, v10
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1956,9 +1955,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX8-NEXT: v_mov_b32_e32 v3, v10
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2000,10 +1999,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
@@ -2040,12 +2039,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2090,11 +2088,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX908-NEXT: v_mov_b32_e32 v7, v0
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
; GFX908-NEXT: v_mov_b32_e32 v2, v7
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v3, v8
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2119,11 +2117,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX8-NEXT: v_mov_b32_e32 v7, v0
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v3, v8
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end