|
52 | 52 | #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
|
53 | 53 |
|
54 | 54 | // AMD
|
55 |
| -// GCN/CNDA, wave size is 64 |
| 55 | +// GCN/CDNA, wave size is 64 |
56 | 56 | #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
57 | 57 | #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
58 | 58 | #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
59 | 59 | #define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
60 | 60 | #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
|
61 | 61 | #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
62 | 62 |
|
63 |
| -// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32 |
| 63 | +// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32 |
64 | 64 | #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
|
65 | 65 | #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
|
66 | 66 | #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
|
| 67 | +#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 |
67 | 68 |
|
68 | 69 | #define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD)
|
69 | 70 | #define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1)
|
70 | 71 | #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
|
71 | 72 | #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
72 |
| -#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3) |
| 73 | +#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) |
| 74 | +#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) |
73 | 75 | #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
|
74 | 76 | #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
|
75 | 77 |
|
@@ -209,9 +211,9 @@ typedef float2 dfloat2;
|
209 | 211 | #define FP16_MMA_AVAILABLE
|
210 | 212 | #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
211 | 213 |
|
212 |
| -#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3)) |
| 214 | +#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4)) |
213 | 215 | #define FP16_MMA_AVAILABLE
|
214 |
| -#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3)) |
| 216 | +#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4)) |
215 | 217 |
|
216 | 218 | #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
217 | 219 | #define NEW_MMA_AVAILABLE
|
@@ -244,14 +246,14 @@ static bool fp16_mma_available(const int cc) {
|
244 | 246 | return false;
|
245 | 247 | #else
|
246 | 248 | return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
|
247 |
| - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); |
| 249 | + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); |
248 | 250 | #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
249 | 251 | }
|
250 | 252 |
|
251 | 253 | // To be used for feature selection of external libraries, e.g. cuBLAS.
|
252 | 254 | static bool fp16_mma_hardware_available(const int cc) {
|
253 | 255 | return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
|
254 |
| - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc); |
| 256 | + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); |
255 | 257 | }
|
256 | 258 |
|
257 | 259 | // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
@@ -409,7 +411,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
409 | 411 | #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
410 | 412 | #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
|
411 | 413 | c = __builtin_amdgcn_sdot4(a, b, c, false);
|
412 |
| -#elif defined(RDNA3) |
| 414 | +#elif defined(RDNA3) || defined(RDNA4) |
413 | 415 | c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
414 | 416 | #elif defined(RDNA1) || defined(__gfx900__)
|
415 | 417 | int tmp1;
|
|
0 commit comments