Skip to content

Commit 0084adc

Browse files
committed
AMDGPU: Add Vega12 and Vega20
Changes by Matt Arsenault and Konstantin Zhuravlyov. llvm-svn: 331215
1 parent 45c7205 commit 0084adc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+2133
-322
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,16 @@ names from both the *Processor* and *Alternative Processor* can be used.
200200
- Radeon Instinct MI25
201201
``gfx902`` ``amdgcn`` APU - xnack - Ryzen 3 2200G
202202
[on] - Ryzen 5 2400G
203+
``gfx904`` ``amdgcn`` dGPU - xnack *TBA*
204+
[off]
205+
.. TODO
206+
Add product
207+
names.
208+
``gfx906`` ``amdgcn`` dGPU - xnack *TBA*
209+
[off]
210+
.. TODO
211+
Add product
212+
names.
203213
=========== =============== ============ ===== ========= ======= ==================
204214

205215
.. _amdgpu-target-features:
@@ -547,8 +557,8 @@ The AMDGPU backend uses the following ELF header:
547557
``EF_AMDGPU_MACH_AMDGCN_GFX810`` 0x02b ``gfx810``
548558
``EF_AMDGPU_MACH_AMDGCN_GFX900`` 0x02c ``gfx900``
549559
``EF_AMDGPU_MACH_AMDGCN_GFX902`` 0x02d ``gfx902``
550-
*reserved* 0x02e Reserved.
551-
*reserved* 0x02f Reserved.
560+
``EF_AMDGPU_MACH_AMDGCN_GFX904`` 0x02e ``gfx904``
561+
``EF_AMDGPU_MACH_AMDGCN_GFX906`` 0x02f ``gfx906``
552562
*reserved* 0x030 Reserved.
553563
================================= ========== =============================
554564

@@ -765,7 +775,7 @@ The following relocation types are supported:
765775
``R_AMDGPU_ABS32_HI`` Static, 2 ``word32`` (S + A) >> 32
766776
Dynamic
767777
``R_AMDGPU_ABS64`` Static, 3 ``word64`` S + A
768-
Dynamic
778+
Dynamic
769779
``R_AMDGPU_REL32`` Static 4 ``word32`` S + A - P
770780
``R_AMDGPU_REL64`` Static 5 ``word64`` S + A - P
771781
``R_AMDGPU_ABS32`` Static, 6 ``word32`` S + A
@@ -784,7 +794,7 @@ the ``mesa3d`` OS, which does not support ``R_AMDGPU_ABS64``.
784794

785795
There is no current OS loader support for 32 bit programs and so
786796
``R_AMDGPU_ABS32`` is not used.
787-
797+
788798
.. _amdgpu-dwarf:
789799

790800
DWARF

llvm/include/llvm/BinaryFormat/ELF.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,7 @@ enum : unsigned {
687687

688688
// AMDGCN-based processors.
689689
EF_AMDGPU_MACH_AMDGCN_FIRST = 0x020,
690-
EF_AMDGPU_MACH_AMDGCN_LAST = 0x02d,
690+
EF_AMDGPU_MACH_AMDGCN_LAST = 0x02f,
691691
// AMDGCN GFX6.
692692
EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
693693
EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
@@ -705,12 +705,12 @@ enum : unsigned {
705705
// AMDGCN GFX9.
706706
EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
707707
EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
708+
EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
709+
EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
708710

709711
// Reserved for AMDGCN-based processors.
710712
EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
711-
EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x02e,
712-
EF_AMDGPU_MACH_AMDGCN_RESERVED2 = 0x02f,
713-
EF_AMDGPU_MACH_AMDGCN_RESERVED3 = 0x030,
713+
EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x030,
714714

715715
// Indicates if the xnack target feature is enabled for all code contained in
716716
// the object.

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,6 +1287,109 @@ def int_amdgcn_ds_bpermute :
12871287
GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
12881288
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
12891289

1290+
//===----------------------------------------------------------------------===//
1291+
// Deep learning intrinsics.
1292+
//===----------------------------------------------------------------------===//
1293+
1294+
// Two-element f16 dot product with an f32 accumulator: takes two v2f16
// operands and an f32 addend and yields an f32 result. Exposed to frontends
// as __builtin_amdgcn_fdot2 and declared IntrNoMem / IntrSpeculatable.
// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c)
1295+
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
1296+
def int_amdgcn_fdot2 :
1297+
GCCBuiltin<"__builtin_amdgcn_fdot2">,
1298+
Intrinsic<
1299+
[llvm_float_ty], // %r
1300+
[
1301+
llvm_v2f16_ty, // %a
1302+
llvm_v2f16_ty, // %b
1303+
llvm_float_ty // %c
1304+
],
1305+
[IntrNoMem, IntrSpeculatable]
1306+
>;
1307+
1308+
// Two-element 16-bit integer dot product with an i32 accumulator: takes two
// v2i16 operands and an i32 addend and yields an i32 result. Presumably the
// signed counterpart of udot2 (the prototype below uses i16/i32 where udot2
// uses u16/u32). Exposed to frontends as __builtin_amdgcn_sdot2 and declared
// IntrNoMem / IntrSpeculatable.
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c)
1309+
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
1310+
def int_amdgcn_sdot2 :
1311+
GCCBuiltin<"__builtin_amdgcn_sdot2">,
1312+
Intrinsic<
1313+
[llvm_i32_ty], // %r
1314+
[
1315+
llvm_v2i16_ty, // %a
1316+
llvm_v2i16_ty, // %b
1317+
llvm_i32_ty // %c
1318+
],
1319+
[IntrNoMem, IntrSpeculatable]
1320+
>;
1321+
1322+
// Two-element 16-bit integer dot product with a 32-bit accumulator, described
// by the prototype below in unsigned terms (u16/u32). The declared IR operand
// types are the same as for sdot2 (v2i16, v2i16, i32 -> i32). Exposed to
// frontends as __builtin_amdgcn_udot2 and declared IntrNoMem /
// IntrSpeculatable.
// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c)
1323+
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
1324+
def int_amdgcn_udot2 :
1325+
GCCBuiltin<"__builtin_amdgcn_udot2">,
1326+
Intrinsic<
1327+
[llvm_i32_ty], // %r
1328+
[
1329+
llvm_v2i16_ty, // %a
1330+
llvm_v2i16_ty, // %b
1331+
llvm_i32_ty // %c
1332+
],
1333+
[IntrNoMem, IntrSpeculatable]
1334+
>;
1335+
1336+
// Four-element 8-bit integer dot product with an i32 accumulator. Each vector
// operand is passed as a single i32 (the prototype below describes it as a
// v4i8 carried in an i32). Exposed to frontends as __builtin_amdgcn_sdot4 and
// declared IntrNoMem / IntrSpeculatable.
// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c)
1337+
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
1338+
def int_amdgcn_sdot4 :
1339+
GCCBuiltin<"__builtin_amdgcn_sdot4">,
1340+
Intrinsic<
1341+
[llvm_i32_ty], // %r
1342+
[
1343+
llvm_i32_ty, // %a
1344+
llvm_i32_ty, // %b
1345+
llvm_i32_ty // %c
1346+
],
1347+
[IntrNoMem, IntrSpeculatable]
1348+
>;
1349+
1350+
// Four-element 8-bit integer dot product with a 32-bit accumulator, described
// by the prototype below in unsigned terms (v4u8 carried in a u32). The
// declared IR signature is (i32, i32, i32) -> i32, identical to sdot4.
// Exposed to frontends as __builtin_amdgcn_udot4 and declared IntrNoMem /
// IntrSpeculatable.
// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c)
1351+
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
1352+
def int_amdgcn_udot4 :
1353+
GCCBuiltin<"__builtin_amdgcn_udot4">,
1354+
Intrinsic<
1355+
[llvm_i32_ty], // %r
1356+
[
1357+
llvm_i32_ty, // %a
1358+
llvm_i32_ty, // %b
1359+
llvm_i32_ty // %c
1360+
],
1361+
[IntrNoMem, IntrSpeculatable]
1362+
>;
1363+
1364+
// Eight-element 4-bit integer dot product with an i32 accumulator. Each vector
// operand is passed as a single i32 (the prototype below describes it as a
// v8i4 carried in an i32). Exposed to frontends as __builtin_amdgcn_sdot8 and
// declared IntrNoMem / IntrSpeculatable.
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c)
1365+
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
1366+
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
1367+
def int_amdgcn_sdot8 :
1368+
GCCBuiltin<"__builtin_amdgcn_sdot8">,
1369+
Intrinsic<
1370+
[llvm_i32_ty], // %r
1371+
[
1372+
llvm_i32_ty, // %a
1373+
llvm_i32_ty, // %b
1374+
llvm_i32_ty // %c
1375+
],
1376+
[IntrNoMem, IntrSpeculatable]
1377+
>;
1378+
1379+
// Eight-element 4-bit integer dot product with a 32-bit accumulator, described
// by the prototype below in unsigned terms (v8u4 carried in a u32). The
// declared IR signature is (i32, i32, i32) -> i32, identical to sdot8.
// Exposed to frontends as __builtin_amdgcn_udot8 and declared IntrNoMem /
// IntrSpeculatable.
// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c)
1380+
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
1381+
// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
1382+
def int_amdgcn_udot8 :
1383+
GCCBuiltin<"__builtin_amdgcn_udot8">,
1384+
Intrinsic<
1385+
[llvm_i32_ty], // %r
1386+
[
1387+
llvm_i32_ty, // %a
1388+
llvm_i32_ty, // %b
1389+
llvm_i32_ty // %c
1390+
],
1391+
[IntrNoMem, IntrSpeculatable]
1392+
>;
12901393

12911394
//===----------------------------------------------------------------------===//
12921395
// Special Intrinsics for backend internal use only. No frontend

llvm/lib/ObjectYAML/ELFYAML.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,8 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
400400
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX810, EF_AMDGPU_MACH);
401401
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_MACH);
402402
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH);
403+
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH);
404+
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
403405
BCase(EF_AMDGPU_XNACK);
404406
break;
405407
case ELF::EM_X86_64:

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
127127
"Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
128128
>;
129129

130+
// Subtarget feature "fma-mix-insts": sets the subtarget's HasFmaMixInsts
// field to true. This is the FMA counterpart of the "mad-mix-insts" feature
// defined just above it.
def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
131+
"HasFmaMixInsts",
132+
"true",
133+
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
134+
>;
135+
130136
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
131137
// XNACK. The current default kernel driver setting is:
132138
// - graphics ring: XNACK disabled
@@ -310,6 +316,12 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
310316
"Has unpacked d16 vmem instructions"
311317
>;
312318

319+
// Subtarget feature "dl-insts": sets the subtarget's HasDLInsts field to true
// for targets that have the deep learning instructions.
def FeatureDLInsts : SubtargetFeature<"dl-insts",
320+
"HasDLInsts",
321+
"true",
322+
"Has deep learning instructions"
323+
>;
324+
313325
//===------------------------------------------------------------===//
314326
// Subtarget Features (options and debugging)
315327
//===------------------------------------------------------------===//
@@ -606,6 +618,18 @@ def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
606618
FeatureXNACK
607619
]>;
608620

621+
// Feature set implied by ISA version 9.0.4: the GFX9 base features plus
// FeatureLDSBankCount32 and FeatureFmaMixInsts.
def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
622+
[FeatureGFX9,
623+
FeatureLDSBankCount32,
624+
FeatureFmaMixInsts]>;
625+
626+
// Feature set implied by ISA version 9.0.6. Compared with ISA version 9.0.4
// it additionally lists HalfRate64Ops and FeatureDLInsts.
def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
627+
[FeatureGFX9,
628+
HalfRate64Ops,
629+
FeatureFmaMixInsts,
630+
FeatureLDSBankCount32,
631+
FeatureDLInsts]>;
632+
609633
//===----------------------------------------------------------------------===//
610634
// Debugger related subtarget features.
611635
//===----------------------------------------------------------------------===//
@@ -788,6 +812,13 @@ def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
788812
def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
789813
AssemblerPredicate<"FeatureMovrel">;
790814

815+
// Predicate for the fma-mix instructions: checked during code generation via
// Subtarget->hasFmaMixInsts() and in the assembler via FeatureFmaMixInsts.
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
816+
AssemblerPredicate<"FeatureFmaMixInsts">;
817+
818+
// Predicate for the deep learning instructions: checked during code
// generation via Subtarget->hasDLInsts() and in the assembler via
// FeatureDLInsts.
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
819+
AssemblerPredicate<"FeatureDLInsts">;
820+
821+
791822
def EnableLateCFGStructurize : Predicate<
792823
"EnableLateStructurizeCFG">;
793824

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
215215
void SelectS_BFE(SDNode *N);
216216
bool isCBranchSCC(const SDNode *N) const;
217217
void SelectBRCOND(SDNode *N);
218-
void SelectFMAD(SDNode *N);
218+
void SelectFMAD_FMA(SDNode *N);
219219
void SelectATOMIC_CMP_SWAP(SDNode *N);
220220

221221
protected:
@@ -621,7 +621,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
621621
SelectBRCOND(N);
622622
return;
623623
case ISD::FMAD:
624-
SelectFMAD(N);
624+
case ISD::FMA:
625+
SelectFMAD_FMA(N);
625626
return;
626627
case AMDGPUISD::ATOMIC_CMP_SWAP:
627628
SelectATOMIC_CMP_SWAP(N);
@@ -1728,9 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
17281729
VCC.getValue(0));
17291730
}
17301731

1731-
void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
1732+
void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
17321733
MVT VT = N->getSimpleValueType(0);
1733-
if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
1734+
bool IsFMA = N->getOpcode() == ISD::FMA;
1735+
if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
1736+
!Subtarget->hasFmaMixInsts()) ||
1737+
((IsFMA && Subtarget->hasMadMixInsts()) ||
1738+
(!IsFMA && Subtarget->hasFmaMixInsts()))) {
17341739
SelectCode(N);
17351740
return;
17361741
}
@@ -1740,13 +1745,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
17401745
SDValue Src2 = N->getOperand(2);
17411746
unsigned Src0Mods, Src1Mods, Src2Mods;
17421747

1743-
// Avoid using v_mad_mix_f32 unless there is actually an operand using the
1744-
// conversion from f16.
1748+
// Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
1749+
// using the conversion from f16.
17451750
bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
17461751
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
17471752
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
17481753

1749-
assert(!Subtarget->hasFP32Denormals() &&
1754+
assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
17501755
"fmad selected with denormals enabled");
17511756
// TODO: We can select this with f32 denormals enabled if all the sources are
17521757
// converted from f16 (in which case fmad isn't legal).
@@ -1762,7 +1767,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
17621767
Zero, Zero
17631768
};
17641769

1765-
CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
1770+
CurDAG->SelectNodeTo(N,
1771+
IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
1772+
MVT::f32, Ops);
17661773
} else {
17671774
SelectCode(N);
17681775
}

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,8 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
939939
// where this is OK to use.
940940
bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
941941
EVT DestVT, EVT SrcVT) const {
942-
return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
942+
return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
943+
(Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
943944
DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
944945
SrcVT.getScalarType() == MVT::f16;
945946
}

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
148148
HasIntClamp(false),
149149
HasVOP3PInsts(false),
150150
HasMadMixInsts(false),
151+
HasFmaMixInsts(false),
151152
HasMovrel(false),
152153
HasVGPRIndexMode(false),
153154
HasScalarStores(false),
@@ -160,6 +161,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
160161
HasSDWAMac(false),
161162
HasSDWAOutModsVOPC(false),
162163
HasDPP(false),
164+
HasDLInsts(false),
163165
FlatAddressSpace(false),
164166
FlatInstOffsets(false),
165167
FlatGlobalInsts(false),

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
7272
ISAVersion8_0_3,
7373
ISAVersion8_1_0,
7474
ISAVersion9_0_0,
75-
ISAVersion9_0_2
75+
ISAVersion9_0_1,
76+
ISAVersion9_0_2,
77+
ISAVersion9_0_4,
78+
ISAVersion9_0_6
7679
};
7780

7881
enum TrapHandlerAbi {
@@ -150,6 +153,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
150153
bool HasIntClamp;
151154
bool HasVOP3PInsts;
152155
bool HasMadMixInsts;
156+
bool HasFmaMixInsts;
153157
bool HasMovrel;
154158
bool HasVGPRIndexMode;
155159
bool HasScalarStores;
@@ -162,6 +166,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
162166
bool HasSDWAMac;
163167
bool HasSDWAOutModsVOPC;
164168
bool HasDPP;
169+
bool HasDLInsts;
165170
bool FlatAddressSpace;
166171
bool FlatInstOffsets;
167172
bool FlatGlobalInsts;
@@ -329,6 +334,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
329334
return HasMadMixInsts;
330335
}
331336

337+
/// \returns the HasFmaMixInsts flag, which the constructor initializes to
/// false; it is the FMA counterpart of hasMadMixInsts().
bool hasFmaMixInsts() const {
338+
return HasFmaMixInsts;
339+
}
340+
332341
bool hasCARRY() const {
333342
return (getGeneration() >= EVERGREEN);
334343
}
@@ -534,6 +543,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
534543
return getGeneration() < SEA_ISLANDS;
535544
}
536545

546+
/// \returns the HasDLInsts flag, which the constructor initializes to false.
bool hasDLInsts() const {
547+
return HasDLInsts;
548+
}
549+
537550
/// \brief Returns the offset in bytes from the start of the input buffer
538551
/// of the first explicit kernel argument.
539552
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {

0 commit comments

Comments
 (0)