llvm
diff --git a/‎llvm/docs/AMDGPUUsage.rst
Lines changed: 14 additions & 4 deletions b/‎llvm/docs/AMDGPUUsage.rst
Lines changed: 14 additions & 4 deletions
diff --git a/‎llvm/include/llvm/BinaryFormat/ELF.h
Lines changed: 4 additions & 4 deletions b/‎llvm/include/llvm/BinaryFormat/ELF.h
Lines changed: 4 additions & 4 deletions
diff --git a/‎llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 103 additions & 0 deletions b/‎llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 103 additions & 0 deletions
diff --git a/‎llvm/lib/ObjectYAML/ELFYAML.cpp
Lines changed: 2 additions & 0 deletions b/‎llvm/lib/ObjectYAML/ELFYAML.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPU.td
Lines changed: 31 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPU.td
Lines changed: 31 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Lines changed: 15 additions & 8 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Lines changed: 15 additions & 8 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 2 additions & 1 deletion b/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Lines changed: 2 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Lines changed: 14 additions & 1 deletion b/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Lines changed: 14 additions & 1 deletion
@@ -200,6 +200,16 @@ names from both the *Processor* and *Alternative Processor* can be used.
                                                                       - Radeon Instinct MI25
      ``gfx902``                  ``amdgcn``   APU   - xnack           - Ryzen 3 2200G
                                                       [on]            - Ryzen 5 2400G
+     ``gfx904``                  ``amdgcn``   dGPU  - xnack           *TBA*
+                                                      [off]
+                                                                      .. TODO
+                                                                         Add product
+                                                                         names.
+     ``gfx906``                  ``amdgcn``   dGPU  - xnack           *TBA*
+                                                      [off]
+                                                                      .. TODO
+                                                                         Add product
+                                                                         names.
      =========== =============== ============ ===== ========= ======= ==================
 
 .. _amdgpu-target-features:
@@ -547,8 +557,8 @@ The AMDGPU backend uses the following ELF header:
      ``EF_AMDGPU_MACH_AMDGCN_GFX810``  0x02b      ``gfx810``
      ``EF_AMDGPU_MACH_AMDGCN_GFX900``  0x02c      ``gfx900``
      ``EF_AMDGPU_MACH_AMDGCN_GFX902``  0x02d      ``gfx902``
-     *reserved*                        0x02e      Reserved.
-     *reserved*                        0x02f      Reserved.
+     ``EF_AMDGPU_MACH_AMDGCN_GFX904``  0x02e      ``gfx904``
+     ``EF_AMDGPU_MACH_AMDGCN_GFX906``  0x02f      ``gfx906``
      *reserved*                        0x030      Reserved.
      ================================= ========== =============================
 
@@ -765,7 +775,7 @@ The following relocation types are supported:
      ``R_AMDGPU_ABS32_HI``      Static, 2      ``word32``  (S + A) >> 32
                                 Dynamic
      ``R_AMDGPU_ABS64``         Static, 3      ``word64``  S + A
-                                Dynamic 
+                                Dynamic
      ``R_AMDGPU_REL32``         Static  4      ``word32``  S + A - P
      ``R_AMDGPU_REL64``         Static  5      ``word64``  S + A - P
      ``R_AMDGPU_ABS32``         Static, 6      ``word32``  S + A
@@ -784,7 +794,7 @@ the ``mesa3d`` OS, which does not support ``R_AMDGPU_ABS64``.
 
 There is no current OS loader support for 32 bit programs and so
 ``R_AMDGPU_ABS32`` is not used.
-     
+
 .. _amdgpu-dwarf:
 
 DWARF
 
@@ -687,7 +687,7 @@ enum : unsigned {
 
   // AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = 0x020,
-  EF_AMDGPU_MACH_AMDGCN_LAST = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_LAST = 0x02f,
   // AMDGCN GFX6.
   EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
   EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
@@ -705,12 +705,12 @@ enum : unsigned {
   // AMDGCN GFX9.
   EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
   EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
+  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
 
   // Reserved for AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
-  EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x02e,
-  EF_AMDGPU_MACH_AMDGCN_RESERVED2 = 0x02f,
-  EF_AMDGPU_MACH_AMDGCN_RESERVED3 = 0x030,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x030,
 
   // Indicates if the xnack target feature is enabled for all code contained in
   // the object.
 
@@ -1287,6 +1287,109 @@ def int_amdgcn_ds_bpermute :
   GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
 
+//===----------------------------------------------------------------------===//
+// Deep learning intrinsics.
+//===----------------------------------------------------------------------===//
+
+// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
+def int_amdgcn_fdot2 :
+  GCCBuiltin<"__builtin_amdgcn_fdot2">,
+  Intrinsic<
+    [llvm_float_ty], // %r
+    [
+      llvm_v2f16_ty, // %a
+      llvm_v2f16_ty, // %b
+      llvm_float_ty  // %c
+    ],
+    [IntrNoMem, IntrSpeculatable]
+  >;
+
+// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
+def int_amdgcn_sdot2 :
+  GCCBuiltin<"__builtin_amdgcn_sdot2">,
+  Intrinsic<
+    [llvm_i32_ty], // %r
+    [
+      llvm_v2i16_ty, // %a
+      llvm_v2i16_ty, // %b
+      llvm_i32_ty    // %c
+    ],
+    [IntrNoMem, IntrSpeculatable]
+  >;
+
+// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
+def int_amdgcn_udot2 :
+  GCCBuiltin<"__builtin_amdgcn_udot2">,
+  Intrinsic<
+    [llvm_i32_ty], // %r
+    [
+      llvm_v2i16_ty, // %a
+      llvm_v2i16_ty, // %b
+      llvm_i32_ty    // %c
+    ],
+    [IntrNoMem, IntrSpeculatable]
+  >;
+
+// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
+def int_amdgcn_sdot4 :
+  GCCBuiltin<"__builtin_amdgcn_sdot4">,
+  Intrinsic<
+    [llvm_i32_ty], // %r
+    [
+      llvm_i32_ty, // %a
+      llvm_i32_ty, // %b
+      llvm_i32_ty  // %c
+    ],
+    [IntrNoMem, IntrSpeculatable]
+  >;
+
+// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
+def int_amdgcn_udot4 :
+  GCCBuiltin<"__builtin_amdgcn_udot4">,
+  Intrinsic<
+    [llvm_i32_ty], // %r
+    [
+      llvm_i32_ty, // %a
+      llvm_i32_ty, // %b
+      llvm_i32_ty  // %c
+    ],
+    [IntrNoMem, IntrSpeculatable]
+  >;
+
+// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
+//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
+def int_amdgcn_sdot8 :
+  GCCBuiltin<"__builtin_amdgcn_sdot8">,
+  Intrinsic<
+    [llvm_i32_ty], // %r
+    [
+      llvm_i32_ty, // %a
+      llvm_i32_ty, // %b
+      llvm_i32_ty  // %c
+    ],
+    [IntrNoMem, IntrSpeculatable]
+  >;
+
+// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c)
+//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
+//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
+def int_amdgcn_udot8 :
+  GCCBuiltin<"__builtin_amdgcn_udot8">,
+  Intrinsic<
+    [llvm_i32_ty], // %r
+    [
+      llvm_i32_ty, // %a
+      llvm_i32_ty, // %b
+      llvm_i32_ty  // %c
+    ],
+    [IntrNoMem, IntrSpeculatable]
+  >;
 
 //===----------------------------------------------------------------------===//
 // Special Intrinsics for backend internal use only. No frontend
 
@@ -400,6 +400,8 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX810, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
     BCase(EF_AMDGPU_XNACK);
     break;
   case ELF::EM_X86_64:
 
@@ -127,6 +127,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
   "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
 >;
 
+def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
+  "HasFmaMixInsts",
+  "true",
+  "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
+>;
+
 // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
 // XNACK. The current default kernel driver setting is:
 // - graphics ring: XNACK disabled
@@ -310,6 +316,12 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
   "Has unpacked d16 vmem instructions"
 >;
 
+def FeatureDLInsts : SubtargetFeature<"dl-insts",
+  "HasDLInsts",
+  "true",
+  "Has deep learning instructions"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -606,6 +618,18 @@ def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
    FeatureXNACK
    ]>;
 
+def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
+  [FeatureGFX9,
+   FeatureLDSBankCount32,
+   FeatureFmaMixInsts]>;
+
+def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
+  [FeatureGFX9,
+   HalfRate64Ops,
+   FeatureFmaMixInsts,
+   FeatureLDSBankCount32,
+   FeatureDLInsts]>;
+
 //===----------------------------------------------------------------------===//
 // Debugger related subtarget features.
 //===----------------------------------------------------------------------===//
@@ -788,6 +812,13 @@ def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
 def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
                 AssemblerPredicate<"FeatureMovrel">;
 
+def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
+  AssemblerPredicate<"FeatureFmaMixInsts">;
+
+def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
+  AssemblerPredicate<"FeatureDLInsts">;
+
+
 def EnableLateCFGStructurize : Predicate<
   "EnableLateStructurizeCFG">;
 
 
@@ -215,7 +215,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
   void SelectBRCOND(SDNode *N);
-  void SelectFMAD(SDNode *N);
+  void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
 
 protected:
@@ -621,7 +621,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     SelectBRCOND(N);
     return;
   case ISD::FMAD:
-    SelectFMAD(N);
+  case ISD::FMA:
+    SelectFMAD_FMA(N);
     return;
   case AMDGPUISD::ATOMIC_CMP_SWAP:
     SelectATOMIC_CMP_SWAP(N);
@@ -1728,9 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
                        VCC.getValue(0));
 }
 
-void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
   MVT VT = N->getSimpleValueType(0);
-  if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
+  bool IsFMA = N->getOpcode() == ISD::FMA;
+  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
+                         !Subtarget->hasFmaMixInsts()) ||
+      ((IsFMA && Subtarget->hasMadMixInsts()) ||
+       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
     SelectCode(N);
     return;
   }
@@ -1740,13 +1745,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
   SDValue Src2 = N->getOperand(2);
   unsigned Src0Mods, Src1Mods, Src2Mods;
 
-  // Avoid using v_mad_mix_f32 unless there is actually an operand using the
-  // conversion from f16.
+  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
+  // using the conversion from f16.
   bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
 
-  assert(!Subtarget->hasFP32Denormals() &&
+  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
          "fmad selected with denormals enabled");
   // TODO: We can select this with f32 denormals enabled if all the sources are
   // converted from f16 (in which case fmad isn't legal).
@@ -1762,7 +1767,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
       Zero, Zero
     };
 
-    CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
+    CurDAG->SelectNodeTo(N,
+                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
+                         MVT::f32, Ops);
   } else {
     SelectCode(N);
   }
 
@@ -939,7 +939,8 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
 // where this is OK to use.
 bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
                                            EVT DestVT, EVT SrcVT) const {
-  return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
+  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
          SrcVT.getScalarType() == MVT::f16;
 }
 
@@ -148,6 +148,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasIntClamp(false),
     HasVOP3PInsts(false),
     HasMadMixInsts(false),
+    HasFmaMixInsts(false),
     HasMovrel(false),
     HasVGPRIndexMode(false),
     HasScalarStores(false),
@@ -160,6 +161,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasSDWAMac(false),
     HasSDWAOutModsVOPC(false),
     HasDPP(false),
+    HasDLInsts(false),
     FlatAddressSpace(false),
     FlatInstOffsets(false),
     FlatGlobalInsts(false),
 
@@ -72,7 +72,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
     ISAVersion8_0_3,
     ISAVersion8_1_0,
     ISAVersion9_0_0,
-    ISAVersion9_0_2
+    ISAVersion9_0_1,
+    ISAVersion9_0_2,
+    ISAVersion9_0_4,
+    ISAVersion9_0_6
   };
 
   enum TrapHandlerAbi {
@@ -150,6 +153,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
   bool HasIntClamp;
   bool HasVOP3PInsts;
   bool HasMadMixInsts;
+  bool HasFmaMixInsts;
   bool HasMovrel;
   bool HasVGPRIndexMode;
   bool HasScalarStores;
@@ -162,6 +166,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
   bool HasSDWAMac;
   bool HasSDWAOutModsVOPC;
   bool HasDPP;
+  bool HasDLInsts;
   bool FlatAddressSpace;
   bool FlatInstOffsets;
   bool FlatGlobalInsts;
@@ -329,6 +334,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
     return HasMadMixInsts;
   }
 
+  bool hasFmaMixInsts() const {
+    return HasFmaMixInsts;
+  }
+
   bool hasCARRY() const {
     return (getGeneration() >= EVERGREEN);
   }
@@ -534,6 +543,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
     return getGeneration() < SEA_ISLANDS;
   }
 
+  bool hasDLInsts() const {
+    return HasDLInsts;
+  }
+
   /// \brief Returns the offset in bytes from the start of the input buffer
   ///        of the first explicit kernel argument.
   unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {