diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 6d3a3d0970..c978a63461 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -155,7 +155,7 @@ * [x] [`_mm512_div_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ps&expand=5236) * [x] [`_mm512_div_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_pd&expand=5236) * [x] [`_mm512_div_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ps&expand=5236) - * [ ] [`_mm512_extractf32x4_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf32x4_ps&expand=5236) + * [x] [`_mm512_extractf32x4_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf32x4_ps&expand=5236) * [ ] [`_mm512_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf64x4_pd&expand=5236) * [ ] [`_mm512_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti32x4_epi32&expand=5236) * [ ] [`_mm512_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti64x4_epi64&expand=5236) @@ -228,14 +228,14 @@ * [ ] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) * [ ] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) * [x] [`_mm512_kand`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kand&expand=5236) - * [ ] [`_mm512_kandn`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kandn&expand=5236) - * [ ] [`_mm512_kmov`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kmov&expand=5236) - * [ ] [`_mm512_knot`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_knot&expand=5236) + * [x] [`_mm512_kandn`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kandn&expand=5236) + * [x] [`_mm512_kmov`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kmov&expand=5236) + * [x] [`_mm512_knot`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_knot&expand=5236) * [x] [`_mm512_kor`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kor&expand=5236) * [ ] [`_mm512_kortestc`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kortestc&expand=5236) * [ ] [`_mm512_kortestz`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kortestz&expand=5236) * [ ] [`_mm512_kunpackb`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kunpackb&expand=5236) - * [ ] [`_mm512_kxnor`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kxnor&expand=5236) + * [x] [`_mm512_kxnor`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kxnor&expand=5236) * [x] [`_mm512_kxor`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kxor&expand=5236) * [ ] [`_mm512_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_epi32&expand=5236) * [ ] [`_mm512_load_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_epi64&expand=5236) @@ -543,9 +543,9 @@ * [ ] 
[`_mm512_mask_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=5236) * [ ] [`_mm512_mask_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=5236) * [ ] [`_mm512_mask_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=5236) - * [ ] [`_mm512_mask_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movedup_pd&expand=5236) - * [ ] [`_mm512_mask_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movehdup_ps&expand=5236) - * [ ] [`_mm512_mask_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_moveldup_ps&expand=5236) + * [x] [`_mm512_mask_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movedup_pd&expand=5236) + * [x] [`_mm512_mask_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movehdup_ps&expand=5236) + * [x] [`_mm512_mask_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_moveldup_ps&expand=5236) * [x] [`_mm512_mask_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_epi32&expand=5236) * [x] [`_mm512_mask_mul_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_epu32&expand=5236) * [x] [`_mm512_mask_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pd&expand=5236) @@ -617,11 +617,11 @@ * [ ] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) * [x] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) - * [ ] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) - * [ ] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236) - * [ ] [`_mm512_mask_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5236) - * [ ] [`_mm512_mask_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x4&expand=5236) - * [ ] [`_mm512_mask_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x2&expand=5236) + * [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) + * [x] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236) + * [x] [`_mm512_mask_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5236) + * [x] [`_mm512_mask_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x4&expand=5236) + * [x] [`_mm512_mask_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x2&expand=5236) * [x] 
[`_mm512_mask_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_pd&expand=5236) * [x] [`_mm512_mask_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_ps&expand=5236) * [x] [`_mm512_mask_sll_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sll_epi32&expand=5236) @@ -841,9 +841,9 @@ * [ ] [`_mm512_maskz_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=5236) * [ ] [`_mm512_maskz_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=5236) * [ ] [`_mm512_maskz_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=5236) - * [ ] [`_mm512_maskz_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movedup_pd&expand=5236) - * [ ] [`_mm512_maskz_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movehdup_ps&expand=5236) - * [ ] [`_mm512_maskz_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_moveldup_ps&expand=5236) + * [x] [`_mm512_maskz_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movedup_pd&expand=5236) + * [x] [`_mm512_maskz_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movehdup_ps&expand=5236) + * [x] [`_mm512_maskz_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_moveldup_ps&expand=5236) * [x] [`_mm512_maskz_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_epi32&expand=5236) * [x] [`_mm512_maskz_mul_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_epu32&expand=5236) * [x] [`_mm512_maskz_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pd&expand=5236) @@ -889,11 +889,11 @@ * [ ] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236) * [ ] [`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236) * [ ] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236) - * [ ] [`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236) - * [ ] [`_mm512_maskz_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32x4&expand=5236) - * [ ] [`_mm512_maskz_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5236) - * [ ] [`_mm512_maskz_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32x4&expand=5236) - * [ ] [`_mm512_maskz_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64x2&expand=5236) + * [x] [`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236) + * [x] [`_mm512_maskz_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32x4&expand=5236) + * [x] 
[`_mm512_maskz_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5236) + * [x] [`_mm512_maskz_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32x4&expand=5236) + * [x] [`_mm512_maskz_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64x2&expand=5236) * [x] [`_mm512_maskz_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_pd&expand=5236) * [x] [`_mm512_maskz_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_ps&expand=5236) * [x] [`_mm512_maskz_sll_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sll_epi32&expand=5236) @@ -952,9 +952,9 @@ * [x] [`_mm512_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ps&expand=5236) * [x] [`_mm512_min_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_pd&expand=5236) * [x] [`_mm512_min_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ps&expand=5236) - * [ ] [`_mm512_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movedup_pd&expand=5236) - * [ ] [`_mm512_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movehdup_ps&expand=5236) - * [ ] [`_mm512_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_moveldup_ps&expand=5236) + * [x] [`_mm512_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movedup_pd&expand=5236) + * [x] [`_mm512_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movehdup_ps&expand=5236) + * [x] [`_mm512_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_moveldup_ps&expand=5236) * [x] [`_mm512_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_epi32&expand=5236) * [x] [`_mm512_mul_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_epu32&expand=5236) * [x] [`_mm512_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pd&expand=5236) @@ -1040,7 +1040,7 @@ * [x] [`_mm512_set_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi64&expand=5236) * [ ] [`_mm512_set_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi8&expand=5236) * [x] [`_mm512_set_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_pd&expand=5236) - * [ ] [`_mm512_set_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ps&expand=5236) + * [x] [`_mm512_set_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ps&expand=5236) * [ ] [`_mm512_setr4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_epi32&expand=5236) * [ ] [`_mm512_setr4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_epi64&expand=5236) * [ ] [`_mm512_setr4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_pd&expand=5236) @@ -1054,11 +1054,11 @@ * [x] [`_mm512_setzero_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ps&expand=5236) * [x] 
[`_mm512_setzero_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_si512&expand=5236) * [ ] [`_mm512_setzero`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero&expand=5236) - * [ ] [`_mm512_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_epi32&expand=5236) - * [ ] [`_mm512_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f32x4&expand=5236) - * [ ] [`_mm512_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5236) - * [ ] [`_mm512_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32x4&expand=5236) - * [ ] [`_mm512_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5236) + * [x] [`_mm512_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_epi32&expand=5236) + * [x] [`_mm512_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f32x4&expand=5236) + * [x] [`_mm512_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5236) + * [x] [`_mm512_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32x4&expand=5236) + * [x] [`_mm512_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5236) * [x] [`_mm512_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_pd&expand=5236) * [x] [`_mm512_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_ps&expand=5236) * [x] [`_mm512_sll_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sll_epi32&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 37d0ca0480..3f9bbfb3e1 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -8507,6 +8507,246 @@ pub unsafe fn _mm512_mask2_permutex2var_pd( transmute(simd_select_bitmask(k, permute, zero)) } +/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_epi32&expand=5150) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] //should be vpshufd, but generate vpermilps +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_shuffle_epi32(a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + + let a = a.as_i32x16(); + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + a, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules!
shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28), + 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29), + 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30), + _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28), + 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29), + 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30), + _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12), + 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13), + 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14), + _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15), + } + }; + } + let r: i32x16 = match imm8 & 0x3 { + 0 => shuffle1!(0, 4, 8, 12), + 1 => shuffle1!(1, 5, 9, 13), + 2 => shuffle1!(2, 6, 10, 14), + _ => shuffle1!(3, 7, 11, 15), + }; + transmute(r) +} + +/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_epi32&expand=5148) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_shuffle_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + imm8: _MM_PERM_ENUM, +) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + + let a = a.as_i32x16(); + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + a, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28), + 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29), + 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30), + _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28), + 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29), + 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30), + _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31), + } + }; + } + macro_rules!
shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12), + 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13), + 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14), + _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15), + } + }; + } + let shuffle: i32x16 = match imm8 & 0x3 { + 0 => shuffle1!(0, 4, 8, 12), + 1 => shuffle1!(1, 5, 9, 13), + 2 => shuffle1!(2, 6, 10, 14), + _ => shuffle1!(3, 7, 11, 15), + }; + transmute(simd_select_bitmask(k, shuffle, src.as_i32x16())) +} + +/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_epi32&expand=5149) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + + let a = a.as_i32x16(); + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + a, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28), + 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29), + 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30), + _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28), + 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29), + 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30), + _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12), + 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13), + 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14), + _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15), + } + }; + } + let shuffle: i32x16 = match imm8 & 0x3 { + 0 => shuffle1!(0, 4, 8, 12), + 1 => shuffle1!(1, 5, 9, 13), + 2 => shuffle1!(2, 6, 10, 14), + _ => shuffle1!(3, 7, 11, 15), + }; + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, shuffle, zero)) +} + +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
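+/// A tiny sketch of the selector layout (values are illustrative, not part of the original docs): within each 128-bit lane, the two low 2-bit fields of imm8 pick elements of a and the two high fields pick elements of b.
+///
+/// ```ignore
+/// unsafe {
+///     let a = _mm512_set1_ps(1.);
+///     let b = _mm512_set1_ps(2.);
+///     // imm8 = 0b01_00_11_10 selects a[2], a[3], b[0], b[1] in every 128-bit lane,
+///     // so each lane of r holds [1., 1., 2., 2.].
+///     let r = _mm512_shuffle_ps(a, b, 0b01_00_11_10);
+/// }
+/// ```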
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_ps&expand=5203) @@ -8979,9 +9219,1018 @@ pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8: }; let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, shuffle, zero)) + transmute(simd_select_bitmask(k, shuffle, zero)) +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_i32&expand=5177) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] //should be vshufi32x4, but generate vshufi64x2 +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + + let a = a.as_i32x16(); + let b = b.as_i32x16(); + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + b, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), + _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), + 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), + 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), + _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), + 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), + 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), + _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), + } + }; + } + let r: i32x16 = match imm8 & 0x3 { + 0 => shuffle1!(0, 1, 2, 3), + 1 => shuffle1!(4, 5, 6, 7), + 2 => shuffle1!(8, 9, 10, 11), + _ => shuffle1!(12, 13, 14, 15), + }; + + transmute(r) +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_i32x&expand=5175) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10111111))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shuffle_i32x4( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + + let a = a.as_i32x16(); + let b = b.as_i32x16(); + macro_rules! 
shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + b, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), + _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), + 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), + 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), + _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), + 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), + 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), + _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), + } + }; + } + let shuffle = match imm8 & 0x3 { + 0 => shuffle1!(0, 1, 2, 3), + 1 => shuffle1!(4, 5, 6, 7), + 2 => shuffle1!(8, 9, 10, 11), + _ => shuffle1!(12, 13, 14, 15), + }; + + transmute(simd_select_bitmask(k, shuffle, src.as_i32x16())) +} + +/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_i32&expand=5176) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10111111))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shuffle_i32x4( + k: __mmask16, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + + let a = a.as_i32x16(); + let b = b.as_i32x16(); + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + b, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), + _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), + } + }; + } + macro_rules! 
shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), + 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), + 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), + _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), + 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), + 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), + _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), + } + }; + } + let shuffle = match imm8 & 0x3 { + 0 => shuffle1!(0, 1, 2, 3), + 1 => shuffle1!(4, 5, 6, 7), + 2 => shuffle1!(8, 9, 10, 11), + _ => shuffle1!(12, 13, 14, 15), + }; + + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, shuffle, zero)) +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_i64x2&expand=5183) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr + ) => { + simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), + _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, 8, 9), + 1 => shuffle3!($a, $b, $e, $f, 10, 11), + 2 => shuffle3!($a, $b, $e, $f, 12, 13), + _ => shuffle3!($a, $b, $e, $f, 14, 15), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, 0, 1), + 1 => shuffle2!($a, $e, 2, 3), + 2 => shuffle2!($a, $e, 4, 5), + _ => shuffle2!($a, $e, 6, 7), + } + }; + } + match imm8 & 0x3 { + 0 => shuffle1!(0, 1), + 1 => shuffle1!(2, 3), + 2 => shuffle1!(4, 5), + _ => shuffle1!(6, 7), + } +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_i64x&expand=5181) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shuffle_i64x2( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr + ) => { + simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]); + }; + } + macro_rules! 
shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), + _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, 8, 9), + 1 => shuffle3!($a, $b, $e, $f, 10, 11), + 2 => shuffle3!($a, $b, $e, $f, 12, 13), + _ => shuffle3!($a, $b, $e, $f, 14, 15), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, 0, 1), + 1 => shuffle2!($a, $e, 2, 3), + 2 => shuffle2!($a, $e, 4, 5), + _ => shuffle2!($a, $e, 6, 7), + } + }; + } + let shuffle = match imm8 & 0x3 { + 0 => shuffle1!(0, 1), + 1 => shuffle1!(2, 3), + 2 => shuffle1!(4, 5), + _ => shuffle1!(6, 7), + }; + + transmute(simd_select_bitmask(k, shuffle, src.as_i64x8())) +} + +/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_i64&expand=5182) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shuffle_i64x2( + k: __mmask8, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr + ) => { + simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), + _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, 8, 9), + 1 => shuffle3!($a, $b, $e, $f, 10, 11), + 2 => shuffle3!($a, $b, $e, $f, 12, 13), + _ => shuffle3!($a, $b, $e, $f, 14, 15), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, 0, 1), + 1 => shuffle2!($a, $e, 2, 3), + 2 => shuffle2!($a, $e, 4, 5), + _ => shuffle2!($a, $e, 6, 7), + } + }; + } + let shuffle = match imm8 & 0x3 { + 0 => shuffle1!(0, 1), + 1 => shuffle1!(2, 3), + 2 => shuffle1!(4, 5), + _ => shuffle1!(6, 7), + }; + + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, shuffle, zero)) +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_f32x4&expand=5165) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] //should be vshuff32x4, but generate vshuff64x2 +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! 
shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + b, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), + _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), + 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), + 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), + _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), + 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), + 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), + _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), + } + }; + } + match imm8 & 0x3 { + 0 => shuffle1!(0, 1, 2, 3), + 1 => shuffle1!(4, 5, 6, 7), + 2 => shuffle1!(8, 9, 10, 11), + _ => shuffle1!(12, 13, 14, 15), + } +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_f32&expand=5163) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shuffle_f32x4( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, + imm8: i32, +) -> __m512 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + b, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), + _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), + } + }; + } + macro_rules! 
shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), + 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), + 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), + _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), + 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), + 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), + _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), + } + }; + } + let shuffle = match imm8 & 0x3 { + 0 => shuffle1!(0, 1, 2, 3), + 1 => shuffle1!(4, 5, 6, 7), + 2 => shuffle1!(8, 9, 10, 11), + _ => shuffle1!(12, 13, 14, 15), + }; + + transmute(simd_select_bitmask(k, shuffle, src.as_f32x16())) +} + +/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_f32&expand=5164) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr, + $i:expr, + $j:expr, + $k:expr, + $l:expr, + $m:expr, + $n:expr, + $o:expr, + $p:expr + ) => { + simd_shuffle16( + a, + b, + [ + $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, + ], + ); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), + _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), + 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), + 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), + _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), + } + }; + } + macro_rules! 
shuffle1 { + ($a:expr, $e:expr, $i: expr, $m: expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), + 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), + 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), + _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), + } + }; + } + let shuffle = match imm8 & 0x3 { + 0 => shuffle1!(0, 1, 2, 3), + 1 => shuffle1!(4, 5, 6, 7), + 2 => shuffle1!(8, 9, 10, 11), + _ => shuffle1!(12, 13, 14, 15), + }; + + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, shuffle, zero)) +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_f64x2&expand=5171) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr + ) => { + simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), + _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, 8, 9), + 1 => shuffle3!($a, $b, $e, $f, 10, 11), + 2 => shuffle3!($a, $b, $e, $f, 12, 13), + _ => shuffle3!($a, $b, $e, $f, 14, 15), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, 0, 1), + 1 => shuffle2!($a, $e, 2, 3), + 2 => shuffle2!($a, $e, 4, 5), + _ => shuffle2!($a, $e, 6, 7), + } + }; + } + match imm8 & 0x3 { + 0 => shuffle1!(0, 1), + 1 => shuffle1!(2, 3), + 2 => shuffle1!(4, 5), + _ => shuffle1!(6, 7), + } +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_f64x2&expand=5169) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shuffle_f64x2( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, + imm8: i32, +) -> __m512d { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr + ) => { + simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), + _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), + } + }; + } + macro_rules! 
shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, 8, 9), + 1 => shuffle3!($a, $b, $e, $f, 10, 11), + 2 => shuffle3!($a, $b, $e, $f, 12, 13), + _ => shuffle3!($a, $b, $e, $f, 14, 15), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, 0, 1), + 1 => shuffle2!($a, $e, 2, 3), + 2 => shuffle2!($a, $e, 4, 5), + _ => shuffle2!($a, $e, 6, 7), + } + }; + } + let shuffle = match imm8 & 0x3 { + 0 => shuffle1!(0, 1), + 1 => shuffle1!(2, 3), + 2 => shuffle1!(4, 5), + _ => shuffle1!(6, 7), + }; + + transmute(simd_select_bitmask(k, shuffle, src.as_f64x8())) +} + +/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_f64x2&expand=5170) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shuffle_f64x2( + k: __mmask8, + a: __m512d, + b: __m512d, + imm8: i32, +) -> __m512d { + let imm8 = (imm8 & 0xFF) as u8; + macro_rules! shuffle4 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr, + $e:expr, + $f:expr, + $g:expr, + $h:expr + ) => { + simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]); + }; + } + macro_rules! shuffle3 { + ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { + match (imm8 >> 6) & 0x3 { + 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), + 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), + 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), + _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), + } + }; + } + macro_rules! shuffle2 { + ($a:expr, $b:expr, $e:expr, $f:expr) => { + match (imm8 >> 4) & 0x3 { + 0 => shuffle3!($a, $b, $e, $f, 8, 9), + 1 => shuffle3!($a, $b, $e, $f, 10, 11), + 2 => shuffle3!($a, $b, $e, $f, 12, 13), + _ => shuffle3!($a, $b, $e, $f, 14, 15), + } + }; + } + macro_rules! shuffle1 { + ($a:expr, $e:expr) => { + match (imm8 >> 2) & 0x3 { + 0 => shuffle2!($a, $e, 0, 1), + 1 => shuffle2!($a, $e, 2, 3), + 2 => shuffle2!($a, $e, 4, 5), + _ => shuffle2!($a, $e, 6, 7), + } + }; + } + let shuffle = match imm8 & 0x3 { + 0 => shuffle1!(0, 1), + 1 => shuffle1!(2, 3), + 2 => shuffle1!(4, 5), + _ => shuffle1!(6, 7), + }; + + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, shuffle, zero)) +} + +/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_extractf32x4_ps&expand=2442) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr( + all(test, not(target_os = "windows")), + assert_instr(vextractf32x4, imm8 = 3) +)] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_extractf32x4_ps(a: __m512, imm8: i32) -> __m128 { + match imm8 & 0x3 { + 0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]), + 1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]), + 2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]), + _ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]), + } +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. 
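+/// A minimal sketch of the effect (lane values are illustrative, not from the PR):
+///
+/// ```ignore
+/// unsafe {
+///     // _mm512_set_ps takes lanes high-to-low, so lane i of a holds i as f32
+///     let a = _mm512_set_ps(
+///         15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
+///     );
+///     // every odd lane is overwritten by the even lane below it:
+///     // r = [0., 0., 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14.]
+///     let r = _mm512_moveldup_ps(a);
+/// }
+/// ```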
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_moveldup_ps&expand=3862) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 { + let r: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(r) +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_moveldup_ps&expand=3860) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) +} + +/// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_moveldup_ps&expand=3861) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovsldup))] +pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 { + let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, mov, zero)) +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_movehdup_ps&expand=3852) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 { + let r: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(r) +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_movehdup&expand=3850) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) +} + +/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
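+/// A short sketch of the zeromask behaviour (the mask value is illustrative, not from the PR):
+///
+/// ```ignore
+/// unsafe {
+///     let a = _mm512_set1_ps(3.);
+///     // only the low four mask bits are set, so lanes 4..16 of r are zeroed,
+///     // while lanes 0..4 hold the odd-duplicated values (all 3. here)
+///     let r = _mm512_maskz_movehdup_ps(0b0000_0000_0000_1111, a);
+/// }
+/// ```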
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_moveh&expand=3851) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovshdup))] +pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 { + let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, mov, zero)) +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_movedup_pd&expand=3843) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d { + let r: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(r) +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_movedup_pd&expand=3841) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) +} + +/// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_movedup_pd&expand=3842) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovddup))] +pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { + let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, mov, zero)) } /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272) @@ -9217,7 +10466,7 @@ pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(kandw(a, b)) + transmute(a & b) } /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. @@ -9227,7 +10476,7 @@ pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(kandw(a, b)) + transmute(a & b) } /// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
@@ -9237,7 +10486,7 @@ pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
 pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
-    transmute(korw(a, b))
+    transmute(a | b)
 }
 
 /// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
@@ -9247,7 +10496,7 @@ pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
 pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
-    transmute(korw(a, b))
+    transmute(a | b)
 }
 
 /// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
@@ -9257,7 +10506,7 @@ pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
 pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
-    transmute(kxorw(a, b))
+    transmute(a ^ b)
 }
 
 /// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
@@ -9267,7 +10516,76 @@ pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
 pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
-    transmute(kxorw(a, b))
+    transmute(a ^ b)
+}
+
+/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=knot_mask16&expand=3233)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
+    transmute(a ^ 0b11111111_11111111)
+}
+
+/// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_knot&expand=3231)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 {
+    transmute(a ^ 0b11111111_11111111)
+}
+
+/// Compute the bitwise NOT of 16-bit mask a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kandn_mask16&expand=3218)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw
+pub unsafe fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_kand(_mm512_knot(a), b)
+}
+
+/// Compute the bitwise NOT of 16-bit mask a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kandn&expand=3216)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw
+pub unsafe fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_kand(_mm512_knot(a), b)
+}
+
+/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxnor_mask16&expand=3285)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw
+pub unsafe fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_knot(_mm512_kxor(a, b))
+}
+
+/// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxnor&expand=3283)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw
+pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
+    _mm512_knot(_mm512_kxor(a, b))
+}
+
+/// Copy 16-bit mask a to k.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kmov&expand=3228)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(mov))] // generate normal mov code instead of kmovw
+pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
+    let r: u16 = a;
+    transmute(r)
 }
 
 /// Sets packed 32-bit integers in `dst` with the supplied values.
@@ -10854,6 +12172,263 @@ pub const _MM_MANT_SIGN_ZERO: _MM_MANTISSA_SIGN_ENUM = 0x01;
 /// DEST = NaN if sign(SRC) = 1
 pub const _MM_MANT_SIGN_NAN: _MM_MANTISSA_SIGN_ENUM = 0x02;
 
+pub const _MM_PERM_AAAA: _MM_PERM_ENUM = 0x00;
+pub const _MM_PERM_AAAB: _MM_PERM_ENUM = 0x01;
+pub const _MM_PERM_AAAC: _MM_PERM_ENUM = 0x02;
+pub const _MM_PERM_AAAD: _MM_PERM_ENUM = 0x03;
+pub const _MM_PERM_AABA: _MM_PERM_ENUM = 0x04;
+pub const _MM_PERM_AABB: _MM_PERM_ENUM = 0x05;
+pub const _MM_PERM_AABC: _MM_PERM_ENUM = 0x06;
+pub const _MM_PERM_AABD: _MM_PERM_ENUM = 0x07;
+pub const _MM_PERM_AACA: _MM_PERM_ENUM = 0x08;
+pub const _MM_PERM_AACB: _MM_PERM_ENUM = 0x09;
+pub const _MM_PERM_AACC: _MM_PERM_ENUM = 0x0A;
+pub const _MM_PERM_AACD: _MM_PERM_ENUM = 0x0B;
+pub const _MM_PERM_AADA: _MM_PERM_ENUM = 0x0C;
+pub const _MM_PERM_AADB: _MM_PERM_ENUM = 0x0D;
+pub const _MM_PERM_AADC: _MM_PERM_ENUM = 0x0E;
+pub const _MM_PERM_AADD: _MM_PERM_ENUM = 0x0F;
+pub const _MM_PERM_ABAA: _MM_PERM_ENUM = 0x10;
+pub const _MM_PERM_ABAB: _MM_PERM_ENUM = 0x11;
+pub const _MM_PERM_ABAC: _MM_PERM_ENUM = 0x12;
+pub const _MM_PERM_ABAD: _MM_PERM_ENUM = 0x13;
+pub const _MM_PERM_ABBA: _MM_PERM_ENUM = 0x14;
+pub const _MM_PERM_ABBB: _MM_PERM_ENUM = 0x15;
+pub const _MM_PERM_ABBC: _MM_PERM_ENUM = 0x16;
+pub const _MM_PERM_ABBD: _MM_PERM_ENUM = 0x17;
+pub const _MM_PERM_ABCA: _MM_PERM_ENUM = 0x18;
+pub const _MM_PERM_ABCB: _MM_PERM_ENUM = 0x19;
+pub const _MM_PERM_ABCC: _MM_PERM_ENUM = 0x1A;
+pub const _MM_PERM_ABCD: _MM_PERM_ENUM = 0x1B;
+pub const _MM_PERM_ABDA: _MM_PERM_ENUM = 0x1C;
+pub const _MM_PERM_ABDB: _MM_PERM_ENUM = 0x1D;
+pub const _MM_PERM_ABDC: _MM_PERM_ENUM = 0x1E;
+pub const _MM_PERM_ABDD: _MM_PERM_ENUM = 0x1F;
+pub const _MM_PERM_ACAA: _MM_PERM_ENUM = 0x20;
+pub const _MM_PERM_ACAB: _MM_PERM_ENUM = 0x21;
+pub const _MM_PERM_ACAC: _MM_PERM_ENUM = 0x22;
+pub const _MM_PERM_ACAD: _MM_PERM_ENUM = 0x23;
+pub const _MM_PERM_ACBA: _MM_PERM_ENUM = 0x24;
+pub const _MM_PERM_ACBB: _MM_PERM_ENUM = 0x25;
+pub const _MM_PERM_ACBC: _MM_PERM_ENUM = 0x26;
+pub const _MM_PERM_ACBD: _MM_PERM_ENUM = 0x27;
+pub const _MM_PERM_ACCA: _MM_PERM_ENUM = 0x28;
+pub const _MM_PERM_ACCB: _MM_PERM_ENUM = 0x29;
+pub const _MM_PERM_ACCC: _MM_PERM_ENUM = 0x2A;
+pub const _MM_PERM_ACCD: _MM_PERM_ENUM =
0x2B; +pub const _MM_PERM_ACDA: _MM_PERM_ENUM = 0x2C; +pub const _MM_PERM_ACDB: _MM_PERM_ENUM = 0x2D; +pub const _MM_PERM_ACDC: _MM_PERM_ENUM = 0x2E; +pub const _MM_PERM_ACDD: _MM_PERM_ENUM = 0x2F; +pub const _MM_PERM_ADAA: _MM_PERM_ENUM = 0x30; +pub const _MM_PERM_ADAB: _MM_PERM_ENUM = 0x31; +pub const _MM_PERM_ADAC: _MM_PERM_ENUM = 0x32; +pub const _MM_PERM_ADAD: _MM_PERM_ENUM = 0x33; +pub const _MM_PERM_ADBA: _MM_PERM_ENUM = 0x34; +pub const _MM_PERM_ADBB: _MM_PERM_ENUM = 0x35; +pub const _MM_PERM_ADBC: _MM_PERM_ENUM = 0x36; +pub const _MM_PERM_ADBD: _MM_PERM_ENUM = 0x37; +pub const _MM_PERM_ADCA: _MM_PERM_ENUM = 0x38; +pub const _MM_PERM_ADCB: _MM_PERM_ENUM = 0x39; +pub const _MM_PERM_ADCC: _MM_PERM_ENUM = 0x3A; +pub const _MM_PERM_ADCD: _MM_PERM_ENUM = 0x3B; +pub const _MM_PERM_ADDA: _MM_PERM_ENUM = 0x3C; +pub const _MM_PERM_ADDB: _MM_PERM_ENUM = 0x3D; +pub const _MM_PERM_ADDC: _MM_PERM_ENUM = 0x3E; +pub const _MM_PERM_ADDD: _MM_PERM_ENUM = 0x3F; +pub const _MM_PERM_BAAA: _MM_PERM_ENUM = 0x40; +pub const _MM_PERM_BAAB: _MM_PERM_ENUM = 0x41; +pub const _MM_PERM_BAAC: _MM_PERM_ENUM = 0x42; +pub const _MM_PERM_BAAD: _MM_PERM_ENUM = 0x43; +pub const _MM_PERM_BABA: _MM_PERM_ENUM = 0x44; +pub const _MM_PERM_BABB: _MM_PERM_ENUM = 0x45; +pub const _MM_PERM_BABC: _MM_PERM_ENUM = 0x46; +pub const _MM_PERM_BABD: _MM_PERM_ENUM = 0x47; +pub const _MM_PERM_BACA: _MM_PERM_ENUM = 0x48; +pub const _MM_PERM_BACB: _MM_PERM_ENUM = 0x49; +pub const _MM_PERM_BACC: _MM_PERM_ENUM = 0x4A; +pub const _MM_PERM_BACD: _MM_PERM_ENUM = 0x4B; +pub const _MM_PERM_BADA: _MM_PERM_ENUM = 0x4C; +pub const _MM_PERM_BADB: _MM_PERM_ENUM = 0x4D; +pub const _MM_PERM_BADC: _MM_PERM_ENUM = 0x4E; +pub const _MM_PERM_BADD: _MM_PERM_ENUM = 0x4F; +pub const _MM_PERM_BBAA: _MM_PERM_ENUM = 0x50; +pub const _MM_PERM_BBAB: _MM_PERM_ENUM = 0x51; +pub const _MM_PERM_BBAC: _MM_PERM_ENUM = 0x52; +pub const _MM_PERM_BBAD: _MM_PERM_ENUM = 0x53; +pub const _MM_PERM_BBBA: _MM_PERM_ENUM = 0x54; +pub const _MM_PERM_BBBB: _MM_PERM_ENUM = 0x55; +pub const _MM_PERM_BBBC: _MM_PERM_ENUM = 0x56; +pub const _MM_PERM_BBBD: _MM_PERM_ENUM = 0x57; +pub const _MM_PERM_BBCA: _MM_PERM_ENUM = 0x58; +pub const _MM_PERM_BBCB: _MM_PERM_ENUM = 0x59; +pub const _MM_PERM_BBCC: _MM_PERM_ENUM = 0x5A; +pub const _MM_PERM_BBCD: _MM_PERM_ENUM = 0x5B; +pub const _MM_PERM_BBDA: _MM_PERM_ENUM = 0x5C; +pub const _MM_PERM_BBDB: _MM_PERM_ENUM = 0x5D; +pub const _MM_PERM_BBDC: _MM_PERM_ENUM = 0x5E; +pub const _MM_PERM_BBDD: _MM_PERM_ENUM = 0x5F; +pub const _MM_PERM_BCAA: _MM_PERM_ENUM = 0x60; +pub const _MM_PERM_BCAB: _MM_PERM_ENUM = 0x61; +pub const _MM_PERM_BCAC: _MM_PERM_ENUM = 0x62; +pub const _MM_PERM_BCAD: _MM_PERM_ENUM = 0x63; +pub const _MM_PERM_BCBA: _MM_PERM_ENUM = 0x64; +pub const _MM_PERM_BCBB: _MM_PERM_ENUM = 0x65; +pub const _MM_PERM_BCBC: _MM_PERM_ENUM = 0x66; +pub const _MM_PERM_BCBD: _MM_PERM_ENUM = 0x67; +pub const _MM_PERM_BCCA: _MM_PERM_ENUM = 0x68; +pub const _MM_PERM_BCCB: _MM_PERM_ENUM = 0x69; +pub const _MM_PERM_BCCC: _MM_PERM_ENUM = 0x6A; +pub const _MM_PERM_BCCD: _MM_PERM_ENUM = 0x6B; +pub const _MM_PERM_BCDA: _MM_PERM_ENUM = 0x6C; +pub const _MM_PERM_BCDB: _MM_PERM_ENUM = 0x6D; +pub const _MM_PERM_BCDC: _MM_PERM_ENUM = 0x6E; +pub const _MM_PERM_BCDD: _MM_PERM_ENUM = 0x6F; +pub const _MM_PERM_BDAA: _MM_PERM_ENUM = 0x70; +pub const _MM_PERM_BDAB: _MM_PERM_ENUM = 0x71; +pub const _MM_PERM_BDAC: _MM_PERM_ENUM = 0x72; +pub const _MM_PERM_BDAD: _MM_PERM_ENUM = 0x73; +pub const _MM_PERM_BDBA: _MM_PERM_ENUM = 0x74; +pub const _MM_PERM_BDBB: _MM_PERM_ENUM = 
0x75; +pub const _MM_PERM_BDBC: _MM_PERM_ENUM = 0x76; +pub const _MM_PERM_BDBD: _MM_PERM_ENUM = 0x77; +pub const _MM_PERM_BDCA: _MM_PERM_ENUM = 0x78; +pub const _MM_PERM_BDCB: _MM_PERM_ENUM = 0x79; +pub const _MM_PERM_BDCC: _MM_PERM_ENUM = 0x7A; +pub const _MM_PERM_BDCD: _MM_PERM_ENUM = 0x7B; +pub const _MM_PERM_BDDA: _MM_PERM_ENUM = 0x7C; +pub const _MM_PERM_BDDB: _MM_PERM_ENUM = 0x7D; +pub const _MM_PERM_BDDC: _MM_PERM_ENUM = 0x7E; +pub const _MM_PERM_BDDD: _MM_PERM_ENUM = 0x7F; +pub const _MM_PERM_CAAA: _MM_PERM_ENUM = 0x80; +pub const _MM_PERM_CAAB: _MM_PERM_ENUM = 0x81; +pub const _MM_PERM_CAAC: _MM_PERM_ENUM = 0x82; +pub const _MM_PERM_CAAD: _MM_PERM_ENUM = 0x83; +pub const _MM_PERM_CABA: _MM_PERM_ENUM = 0x84; +pub const _MM_PERM_CABB: _MM_PERM_ENUM = 0x85; +pub const _MM_PERM_CABC: _MM_PERM_ENUM = 0x86; +pub const _MM_PERM_CABD: _MM_PERM_ENUM = 0x87; +pub const _MM_PERM_CACA: _MM_PERM_ENUM = 0x88; +pub const _MM_PERM_CACB: _MM_PERM_ENUM = 0x89; +pub const _MM_PERM_CACC: _MM_PERM_ENUM = 0x8A; +pub const _MM_PERM_CACD: _MM_PERM_ENUM = 0x8B; +pub const _MM_PERM_CADA: _MM_PERM_ENUM = 0x8C; +pub const _MM_PERM_CADB: _MM_PERM_ENUM = 0x8D; +pub const _MM_PERM_CADC: _MM_PERM_ENUM = 0x8E; +pub const _MM_PERM_CADD: _MM_PERM_ENUM = 0x8F; +pub const _MM_PERM_CBAA: _MM_PERM_ENUM = 0x90; +pub const _MM_PERM_CBAB: _MM_PERM_ENUM = 0x91; +pub const _MM_PERM_CBAC: _MM_PERM_ENUM = 0x92; +pub const _MM_PERM_CBAD: _MM_PERM_ENUM = 0x93; +pub const _MM_PERM_CBBA: _MM_PERM_ENUM = 0x94; +pub const _MM_PERM_CBBB: _MM_PERM_ENUM = 0x95; +pub const _MM_PERM_CBBC: _MM_PERM_ENUM = 0x96; +pub const _MM_PERM_CBBD: _MM_PERM_ENUM = 0x97; +pub const _MM_PERM_CBCA: _MM_PERM_ENUM = 0x98; +pub const _MM_PERM_CBCB: _MM_PERM_ENUM = 0x99; +pub const _MM_PERM_CBCC: _MM_PERM_ENUM = 0x9A; +pub const _MM_PERM_CBCD: _MM_PERM_ENUM = 0x9B; +pub const _MM_PERM_CBDA: _MM_PERM_ENUM = 0x9C; +pub const _MM_PERM_CBDB: _MM_PERM_ENUM = 0x9D; +pub const _MM_PERM_CBDC: _MM_PERM_ENUM = 0x9E; +pub const _MM_PERM_CBDD: _MM_PERM_ENUM = 0x9F; +pub const _MM_PERM_CCAA: _MM_PERM_ENUM = 0xA0; +pub const _MM_PERM_CCAB: _MM_PERM_ENUM = 0xA1; +pub const _MM_PERM_CCAC: _MM_PERM_ENUM = 0xA2; +pub const _MM_PERM_CCAD: _MM_PERM_ENUM = 0xA3; +pub const _MM_PERM_CCBA: _MM_PERM_ENUM = 0xA4; +pub const _MM_PERM_CCBB: _MM_PERM_ENUM = 0xA5; +pub const _MM_PERM_CCBC: _MM_PERM_ENUM = 0xA6; +pub const _MM_PERM_CCBD: _MM_PERM_ENUM = 0xA7; +pub const _MM_PERM_CCCA: _MM_PERM_ENUM = 0xA8; +pub const _MM_PERM_CCCB: _MM_PERM_ENUM = 0xA9; +pub const _MM_PERM_CCCC: _MM_PERM_ENUM = 0xAA; +pub const _MM_PERM_CCCD: _MM_PERM_ENUM = 0xAB; +pub const _MM_PERM_CCDA: _MM_PERM_ENUM = 0xAC; +pub const _MM_PERM_CCDB: _MM_PERM_ENUM = 0xAD; +pub const _MM_PERM_CCDC: _MM_PERM_ENUM = 0xAE; +pub const _MM_PERM_CCDD: _MM_PERM_ENUM = 0xAF; +pub const _MM_PERM_CDAA: _MM_PERM_ENUM = 0xB0; +pub const _MM_PERM_CDAB: _MM_PERM_ENUM = 0xB1; +pub const _MM_PERM_CDAC: _MM_PERM_ENUM = 0xB2; +pub const _MM_PERM_CDAD: _MM_PERM_ENUM = 0xB3; +pub const _MM_PERM_CDBA: _MM_PERM_ENUM = 0xB4; +pub const _MM_PERM_CDBB: _MM_PERM_ENUM = 0xB5; +pub const _MM_PERM_CDBC: _MM_PERM_ENUM = 0xB6; +pub const _MM_PERM_CDBD: _MM_PERM_ENUM = 0xB7; +pub const _MM_PERM_CDCA: _MM_PERM_ENUM = 0xB8; +pub const _MM_PERM_CDCB: _MM_PERM_ENUM = 0xB9; +pub const _MM_PERM_CDCC: _MM_PERM_ENUM = 0xBA; +pub const _MM_PERM_CDCD: _MM_PERM_ENUM = 0xBB; +pub const _MM_PERM_CDDA: _MM_PERM_ENUM = 0xBC; +pub const _MM_PERM_CDDB: _MM_PERM_ENUM = 0xBD; +pub const _MM_PERM_CDDC: _MM_PERM_ENUM = 0xBE; +pub const _MM_PERM_CDDD: _MM_PERM_ENUM = 
0xBF; +pub const _MM_PERM_DAAA: _MM_PERM_ENUM = 0xC0; +pub const _MM_PERM_DAAB: _MM_PERM_ENUM = 0xC1; +pub const _MM_PERM_DAAC: _MM_PERM_ENUM = 0xC2; +pub const _MM_PERM_DAAD: _MM_PERM_ENUM = 0xC3; +pub const _MM_PERM_DABA: _MM_PERM_ENUM = 0xC4; +pub const _MM_PERM_DABB: _MM_PERM_ENUM = 0xC5; +pub const _MM_PERM_DABC: _MM_PERM_ENUM = 0xC6; +pub const _MM_PERM_DABD: _MM_PERM_ENUM = 0xC7; +pub const _MM_PERM_DACA: _MM_PERM_ENUM = 0xC8; +pub const _MM_PERM_DACB: _MM_PERM_ENUM = 0xC9; +pub const _MM_PERM_DACC: _MM_PERM_ENUM = 0xCA; +pub const _MM_PERM_DACD: _MM_PERM_ENUM = 0xCB; +pub const _MM_PERM_DADA: _MM_PERM_ENUM = 0xCC; +pub const _MM_PERM_DADB: _MM_PERM_ENUM = 0xCD; +pub const _MM_PERM_DADC: _MM_PERM_ENUM = 0xCE; +pub const _MM_PERM_DADD: _MM_PERM_ENUM = 0xCF; +pub const _MM_PERM_DBAA: _MM_PERM_ENUM = 0xD0; +pub const _MM_PERM_DBAB: _MM_PERM_ENUM = 0xD1; +pub const _MM_PERM_DBAC: _MM_PERM_ENUM = 0xD2; +pub const _MM_PERM_DBAD: _MM_PERM_ENUM = 0xD3; +pub const _MM_PERM_DBBA: _MM_PERM_ENUM = 0xD4; +pub const _MM_PERM_DBBB: _MM_PERM_ENUM = 0xD5; +pub const _MM_PERM_DBBC: _MM_PERM_ENUM = 0xD6; +pub const _MM_PERM_DBBD: _MM_PERM_ENUM = 0xD7; +pub const _MM_PERM_DBCA: _MM_PERM_ENUM = 0xD8; +pub const _MM_PERM_DBCB: _MM_PERM_ENUM = 0xD9; +pub const _MM_PERM_DBCC: _MM_PERM_ENUM = 0xDA; +pub const _MM_PERM_DBCD: _MM_PERM_ENUM = 0xDB; +pub const _MM_PERM_DBDA: _MM_PERM_ENUM = 0xDC; +pub const _MM_PERM_DBDB: _MM_PERM_ENUM = 0xDD; +pub const _MM_PERM_DBDC: _MM_PERM_ENUM = 0xDE; +pub const _MM_PERM_DBDD: _MM_PERM_ENUM = 0xDF; +pub const _MM_PERM_DCAA: _MM_PERM_ENUM = 0xE0; +pub const _MM_PERM_DCAB: _MM_PERM_ENUM = 0xE1; +pub const _MM_PERM_DCAC: _MM_PERM_ENUM = 0xE2; +pub const _MM_PERM_DCAD: _MM_PERM_ENUM = 0xE3; +pub const _MM_PERM_DCBA: _MM_PERM_ENUM = 0xE4; +pub const _MM_PERM_DCBB: _MM_PERM_ENUM = 0xE5; +pub const _MM_PERM_DCBC: _MM_PERM_ENUM = 0xE6; +pub const _MM_PERM_DCBD: _MM_PERM_ENUM = 0xE7; +pub const _MM_PERM_DCCA: _MM_PERM_ENUM = 0xE8; +pub const _MM_PERM_DCCB: _MM_PERM_ENUM = 0xE9; +pub const _MM_PERM_DCCC: _MM_PERM_ENUM = 0xEA; +pub const _MM_PERM_DCCD: _MM_PERM_ENUM = 0xEB; +pub const _MM_PERM_DCDA: _MM_PERM_ENUM = 0xEC; +pub const _MM_PERM_DCDB: _MM_PERM_ENUM = 0xED; +pub const _MM_PERM_DCDC: _MM_PERM_ENUM = 0xEE; +pub const _MM_PERM_DCDD: _MM_PERM_ENUM = 0xEF; +pub const _MM_PERM_DDAA: _MM_PERM_ENUM = 0xF0; +pub const _MM_PERM_DDAB: _MM_PERM_ENUM = 0xF1; +pub const _MM_PERM_DDAC: _MM_PERM_ENUM = 0xF2; +pub const _MM_PERM_DDAD: _MM_PERM_ENUM = 0xF3; +pub const _MM_PERM_DDBA: _MM_PERM_ENUM = 0xF4; +pub const _MM_PERM_DDBB: _MM_PERM_ENUM = 0xF5; +pub const _MM_PERM_DDBC: _MM_PERM_ENUM = 0xF6; +pub const _MM_PERM_DDBD: _MM_PERM_ENUM = 0xF7; +pub const _MM_PERM_DDCA: _MM_PERM_ENUM = 0xF8; +pub const _MM_PERM_DDCB: _MM_PERM_ENUM = 0xF9; +pub const _MM_PERM_DDCC: _MM_PERM_ENUM = 0xFA; +pub const _MM_PERM_DDCD: _MM_PERM_ENUM = 0xFB; +pub const _MM_PERM_DDDA: _MM_PERM_ENUM = 0xFC; +pub const _MM_PERM_DDDB: _MM_PERM_ENUM = 0xFD; +pub const _MM_PERM_DDDC: _MM_PERM_ENUM = 0xFE; +pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF; + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.pmul.dq.512"] @@ -11088,13 +12663,6 @@ extern "C" { fn vpermi2ps(a: f32x16, idx: i32x16, b: f32x16) -> f32x16; #[link_name = "llvm.x86.avx512.vpermi2var.pd.512"] fn vpermi2pd(a: f64x8, idx: i64x8, b: f64x8) -> f64x8; - - #[link_name = "llvm.x86.avx512.kand.w"] - fn kandw(ma: u16, mb: u16) -> u16; - #[link_name = "llvm.x86.avx512.kor.w"] - fn korw(ma: u16, mb: u16) -> u16; - #[link_name = 
"llvm.x86.avx512.kxor.w"] - fn kxorw(ma: u16, mb: u16) -> u16; } #[cfg(test)] @@ -16553,6 +18121,34 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_shuffle_epi32(a, _MM_PERM_AADD); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + assert_eq_m512i(r, a); + let r = _mm512_mask_shuffle_epi32(a, 0b11111111_11111111, a, _MM_PERM_AADD); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_epi32() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm512_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_epi32(0b00000000_11111111, a, _MM_PERM_AADD); + let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_ps() { let a = _mm512_setr_ps( @@ -16602,6 +18198,176 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_shuffle_i32x4(a, b, 0b00000000); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_mask_shuffle_i32x4(a, 0, a, b, 0b00000000); + assert_eq_m512i(r, a); + let r = _mm512_mask_shuffle_i32x4(a, 0b11111111_11111111, a, b, 0b00000000); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_i32x4() { + let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_maskz_shuffle_i32x4(0, a, b, 0b00000000); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_i32x4(0b00000000_11111111, a, b, 0b00000000); + let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_shuffle_f32x4(a, b, 0b00000000); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 
5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_mask_shuffle_f32x4(a, 0, a, b, 0b00000000); + assert_eq_m512(r, a); + let r = _mm512_mask_shuffle_f32x4(a, 0b11111111_11111111, a, b, 0b00000000); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_f32x4() { + let a = _mm512_setr_ps( + 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16., + ); + let b = _mm512_setr_ps( + 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., + ); + let r = _mm512_maskz_shuffle_f32x4(0, a, b, 0b00000000); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_shuffle_f32x4(0b00000000_11111111, a, b, 0b00000000); + let e = _mm512_setr_ps( + 1., 4., 5., 8., 1., 4., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_extractf32x4_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_extractf32x4_ps(a, 0x1); + let e = _mm_setr_ps(5., 6., 7., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_moveldup_ps(a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_moveldup_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_moveldup_ps(a, 0b11111111_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_moveldup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_maskz_moveldup_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_moveldup_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 1., 1., 3., 3., 5., 5., 7., 7., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_movehdup_ps(a); + let e = _mm512_setr_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let r = _mm512_mask_movehdup_ps(a, 0, a); + assert_eq_m512(r, a); + let r = _mm512_mask_movehdup_ps(a, 0b11111111_11111111, a); + let e = _mm512_setr_ps( + 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_movehdup_ps() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 
14., 15., 16.,
+        );
+        let r = _mm512_maskz_movehdup_ps(0, a);
+        assert_eq_m512(r, _mm512_setzero_ps());
+        let r = _mm512_maskz_movehdup_ps(0b00000000_11111111, a);
+        let e = _mm512_setr_ps(
+            2., 2., 4., 4., 6., 6., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+        );
+        assert_eq_m512(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi32() {
         let a = _mm512_set_epi32(
@@ -17325,4 +19091,64 @@ mod tests {
         let e: u16 = 0b11100010_00111000;
         assert_eq!(r, e);
     }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_knot() {
+        let a: u16 = 0b11001100_00110011;
+        let r = _mm512_knot(a);
+        let e: u16 = 0b00110011_11001100;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_knot_mask16() {
+        let a: u16 = 0b11001100_00110011;
+        let r = _knot_mask16(a);
+        let e: u16 = 0b00110011_11001100;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_kandn() {
+        let a: u16 = 0b11001100_00110011;
+        let b: u16 = 0b00101110_00001011;
+        let r = _mm512_kandn(a, b);
+        let e: u16 = 0b00100010_00001000;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_kandn_mask16() {
+        let a: u16 = 0b11001100_00110011;
+        let b: u16 = 0b00101110_00001011;
+        let r = _kandn_mask16(a, b);
+        let e: u16 = 0b00100010_00001000;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_kxnor() {
+        let a: u16 = 0b11001100_00110011;
+        let b: u16 = 0b00101110_00001011;
+        let r = _mm512_kxnor(a, b);
+        let e: u16 = 0b00011101_11000111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_kxnor_mask16() {
+        let a: u16 = 0b11001100_00110011;
+        let b: u16 = 0b00101110_00001011;
+        let r = _kxnor_mask16(a, b);
+        let e: u16 = 0b00011101_11000111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_kmov() {
+        let a: u16 = 0b11001100_00110011;
+        let r = _mm512_kmov(a);
+        let e: u16 = 0b11001100_00110011;
+        assert_eq!(r, e);
+    }
 }
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index 7de6696a7d..29f6440e1b 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -318,6 +318,10 @@ pub type _MM_MANTISSA_NORM_ENUM = i32;
 #[allow(non_camel_case_types)]
 pub type _MM_MANTISSA_SIGN_ENUM = i32;
 
+/// The `_MM_PERM_ENUM` type used to specify shuffle operations in AVX-512 intrinsics.
+#[allow(non_camel_case_types)] +pub type _MM_PERM_ENUM = i32; + #[cfg(test)] mod test; #[cfg(test)] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index d3993a95d7..036cf36c74 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4188,6 +4188,96 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_i64x2() { + let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_shuffle_i64x2(a, b, 0b00000000); + let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_i64x2() { + let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_mask_shuffle_i64x2(a, 0, a, b, 0b00000000); + assert_eq_m512i(r, a); + let r = _mm512_mask_shuffle_i64x2(a, 0b11111111, a, b, 0b00000000); + let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_i64x2() { + let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm512_maskz_shuffle_i64x2(0, a, b, 0b00000000); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shuffle_i64x2(0b00001111, a, b, 0b00000000); + let e = _mm512_setr_epi64(1, 4, 1, 4, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_shuffle_f64x2() { + let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm512_shuffle_f64x2(a, b, 0b00000000); + let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_f64x2() { + let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm512_mask_shuffle_f64x2(a, 0, a, b, 0b00000000); + assert_eq_m512d(r, a); + let r = _mm512_mask_shuffle_f64x2(a, 0b11111111, a, b, 0b00000000); + let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_f64x2() { + let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm512_maskz_shuffle_f64x2(0, a, b, 0b00000000); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_shuffle_f64x2(0b00001111, a, b, 0b00000000); + let e = _mm512_setr_pd(1., 4., 1., 4., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_movedup_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_movedup_pd(a); + let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_movedup_pd() { + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm512_mask_movedup_pd(a, 0, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_movedup_pd(a, 0b11111111, a); + let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_movedup_pd() { + let a = _mm512_setr_pd(1., 2., 3., 
4., 5., 6., 7., 8.); + let r = _mm512_maskz_movedup_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_movedup_pd(0b00001111, a); + let e = _mm512_setr_pd(1., 1., 3., 3., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index e695c40d59..6d9dfe6229 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -150,6 +150,7 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "_MM_CMPINT_ENUM" => quote! { &MM_CMPINT_ENUM }, "_MM_MANTISSA_NORM_ENUM" => quote! { &MM_MANTISSA_NORM_ENUM }, "_MM_MANTISSA_SIGN_ENUM" => quote! { &MM_MANTISSA_SIGN_ENUM }, + "_MM_PERM_ENUM" => quote! { &MM_PERM_ENUM }, "__m64" => quote! { &M64 }, "bool" => quote! { &BOOL }, "f32" => quote! { &F32 }, diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 05843c7490..d9adf91999 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -58,6 +58,7 @@ static MMASK16: Type = Type::MMASK16; static MM_CMPINT_ENUM: Type = Type::MM_CMPINT_ENUM; static MM_MANTISSA_NORM_ENUM: Type = Type::MM_MANTISSA_NORM_ENUM; static MM_MANTISSA_SIGN_ENUM: Type = Type::MM_MANTISSA_SIGN_ENUM; +static MM_PERM_ENUM: Type = Type::MM_PERM_ENUM; static TUPLE: Type = Type::Tuple; static CPUID: Type = Type::CpuidResult; @@ -85,6 +86,7 @@ enum Type { MM_CMPINT_ENUM, MM_MANTISSA_NORM_ENUM, MM_MANTISSA_SIGN_ENUM, + MM_PERM_ENUM, Tuple, CpuidResult, Never, @@ -680,6 +682,7 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::MM_CMPINT_ENUM, "_MM_CMPINT_ENUM") => {} (&Type::MM_MANTISSA_NORM_ENUM, "_MM_MANTISSA_NORM_ENUM") => {} (&Type::MM_MANTISSA_SIGN_ENUM, "_MM_MANTISSA_SIGN_ENUM") => {} + (&Type::MM_PERM_ENUM, "_MM_PERM_ENUM") => {} // This is a macro (?) in C which seems to mutate its arguments, but // that means that we're taking pointers to arguments in rust
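// NOTE (editorial aside, not part of the patch): a small sketch of how the
// `_MM_PERM_*` constants verified above encode a `vpshufd`-style control byte.
// Reading `_MM_PERM_WXYZ` left to right, each letter (A=0 .. D=3) is a 2-bit
// source index, with the leftmost letter in the most significant bits. `perm`
// is a hypothetical helper used only to illustrate the encoding.
const fn perm(w: i32, x: i32, y: i32, z: i32) -> i32 {
    (w << 6) | (x << 4) | (y << 2) | z
}

fn main() {
    const A: i32 = 0;
    const D: i32 = 3;
    // Matches `_MM_PERM_AADD = 0x0F` (0b00_00_11_11) from the patch.
    assert_eq!(perm(A, A, D, D), 0x0F);
    // Under this control, destination elements 0 and 1 of each 128-bit lane take
    // source index 3, and elements 2 and 3 take source index 0, which is why
    // `test_mm512_shuffle_epi32` expects (8, 8, 1, 1, ...) from (1, 4, 5, 8, ...).
}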