Skip to content

Commit 2f538b9

Browse files
Add __hgt2_mask implementation for CUDA 11
1 parent 0bc67dd commit 2f538b9

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

ggml-cuda/common.cuh

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -306,6 +306,13 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
306306
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
307307
}
308308

309+
#if CUDART_VERSION < 12000
310+
// Per-lane "greater than" comparison of two half2 values, returned as a bit mask.
// Only compiled when CUDART_VERSION < 12000 (see the enclosing preprocessor guard).
//
// Returns a 32-bit value in which
//   bits  0-15 are all ones if low(a)  > low(b),  otherwise all zeros;
//   bits 16-31 are all ones if high(a) > high(b), otherwise all zeros.
// A lane with a NaN operand compares false and therefore contributes zero bits.
static __device__ __forceinline__ unsigned int __hgt2_mask(const half2 a, const half2 b) {
    // `unsigned int` instead of `uint`: `uint` is a non-standard typedef that is not available on
    // every host toolchain, while the underlying type is the same wherever `uint` does exist.
    //
    // The lanes are compared as float. half -> float conversion is exact, so the outcome is the
    // same as comparing the halves directly, but it does not depend on the half `operator>`
    // overload, which the CUDA fp16 headers do not provide for every build configuration
    // (e.g. when __CUDA_NO_HALF_OPERATORS__ is defined).
    const unsigned int mask_low  = 0x0000FFFFu * ( __low2float(a) >  __low2float(b));
    const unsigned int mask_high = 0xFFFF0000u * (__high2float(a) > __high2float(b));
    return mask_low | mask_high;
}
315+
#endif // CUDART_VERSION < 12000
309316

310317
#if defined(GGML_USE_HIPBLAS)
311318
#define __CUDA_ARCH__ 1300

0 commit comments

Comments
 (0)