From 21da8aba770744b9838fd7057c4ae6991cde445b Mon Sep 17 00:00:00 2001 From: jirong Date: Sun, 14 Feb 2021 14:22:31 -0500 Subject: [PATCH 01/31] cvtepi8_epi64: mm256,mm; cvtepu16_epi32,epi64: mm256,mm; cvtepu32_epi64,pd: mm256,mm; cvtepu8_epi32,epi64: mm256,mm --- crates/core_arch/avx512f.md | 100 ++++-- crates/core_arch/src/x86/avx512f.rs | 474 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 255 +++++++++++++ 3 files changed, 801 insertions(+), 28 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index e61f25507c..c34df5cfc9 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2025,6 +2025,65 @@ * [x] [`_mm_maskz_cvtepi8_epi32`] * [x] [`_mm256_mask_cvtepi8_epi32`] * [x] [`_mm256_maskz_cvtepi8_epi32`] + * [x] [`_mm512_cvtepi8_epi64`] + * [x] [`_mm512_mask_cvtepi8_epi64`] + * [x] [`_mm512_maskz_cvtepi8_epi64`] + * [x] [`_mm_mask_cvtepi8_epi64`] + * [x] [`_mm_maskz_cvtepi8_epi64`] + * [x] [`_mm256_mask_cvtepi8_epi64`] + * [x] [`_mm256_maskz_cvtepi8_epi64`] + * [x] [`_mm512_cvtepu16_epi32`] + * [x] [`_mm512_mask_cvtepu16_epi32`] + * [x] [`_mm512_maskz_cvtepu16_epi32`] + * [x] [`_mm_mask_cvtepu16_epi32`] + * [x] [`_mm_maskz_cvtepu16_epi32`] + * [x] [`_mm256_mask_cvtepu16_epi32`] + * [x] [`_mm256_maskz_cvtepu16_epi32`] + * [x] [`_mm512_cvtepu16_epi64`] + * [x] [`_mm512_mask_cvtepu16_epi64`] + * [x] [`_mm512_maskz_cvtepu16_epi64`] + * [x] [`_mm_mask_cvtepu16_epi64`] + * [x] [`_mm_maskz_cvtepu16_epi64`] + * [x] [`_mm256_mask_cvtepu16_epi64`] + * [x] [`_mm256_maskz_cvtepu16_epi64`] + * [x] [`_mm512_cvtepu32_epi64`] + * [x] [`_mm512_mask_cvtepu32_epi64`] + * [x] [`_mm512_maskz_cvtepu32_epi64`] + * [x] [`_mm_mask_cvtepu32_epi64`] + * [x] [`_mm_maskz_cvtepu32_epi64`] + * [x] [`_mm256_mask_cvtepu32_epi64`] + * [x] [`_mm256_maskz_cvtepu32_epi64`] + * [x] [`_mm512_cvtepu32_ps`] + * [x] [`_mm512_mask_cvtepu32_ps`] + * [x] [`_mm512_maskz_cvtepu32_ps`] + * [x] [`_mm512_cvtepu32_pd`] + * [x] [`_mm512_mask_cvtepu32_pd`] + * [x] [`_mm512_maskz_cvtepu32_pd`] + * [x] [`_mm_cvtepu32_pd`] + * [x] [`_mm_mask_cvtepu32_pd`] + * [x] [`_mm_maskz_cvtepu32_pd`] + * [x] [`_mm256_cvtepu32_pd`] + * [x] [`_mm256_mask_cvtepu32_pd`] + * [x] [`_mm256_maskz_cvtepu32_pd`] + * [x] [`_mm512_cvtepu8_epi32`] + * [x] [`_mm512_mask_cvtepu8_epi32`] + * [x] [`_mm512_maskz_cvtepu8_epi32`] + * [x] [`_mm_mask_cvtepu8_epi32`] + * [x] [`_mm_maskz_cvtepu8_epi32`] + * [x] [`_mm256_mask_cvtepu8_epi32`] + * [x] [`_mm256_maskz_cvtepu8_epi32`] + * [x] [`_mm512_cvtepu8_epi64`] + * [x] [`_mm512_mask_cvtepu8_epi64`] + * [x] [`_mm512_maskz_cvtepu8_epi64`] + * [x] [`_mm_mask_cvtepu8_epi64`] + * [x] [`_mm_maskz_cvtepu8_epi64`] + * [x] [`_mm256_mask_cvtepu8_epi64`] + * [x] [`_mm256_maskz_cvtepu8_epi64`] + * [x] [`_mm512_cvtpd_epi32`] + * [x] [`_mm512_mask_cvtpd_epi32`] + * [x] [`_mm512_maskz_cvtpd_epi32`] + + * [x] [`_mm512_mask_cvtepu32lo_pd`] * [x] [`_mm512_mask_cvtsepi64_epi32`] * [x] [`_mm512_mask_cvtsepi64_epi8`] @@ -2042,16 +2101,12 @@ * [x] [`_mm512_cvt_roundps_pd`] * [x] [`_mm512_mask_cvtsepi64_epi16`] - * [x] [`_mm512_cvtepi8_epi64`] - * [x] [`_mm512_cvtepu16_epi32`] - * [x] [`_mm512_cvtepu16_epi64`] - * [x] [`_mm512_cvtepu32_epi64`] - * [x] [`_mm512_cvtepu32_pd`] - * [x] [`_mm512_cvtepu32_ps`] + + + * [x] [`_mm512_cvtepu32lo_pd`] - * [x] [`_mm512_cvtepu8_epi32`] - * [x] [`_mm512_cvtepu8_epi64`] - * [x] [`_mm512_cvtpd_epi32`] + + * [x] [`_mm512_cvtpd_epu32`] * [x] [`_mm512_cvtpd_ps`] * [x] [`_mm512_cvtpd_pslo`] @@ -2100,16 +2155,10 @@ * [x] 
[`_mm512_mask_cvt_roundps_epi32`] * [x] [`_mm512_mask_cvt_roundps_epu32`] * [x] [`_mm512_mask_cvt_roundps_pd`] - * [x] [`_mm512_mask_cvtepi8_epi64`] - * [x] [`_mm512_mask_cvtepu16_epi32`] - * [x] [`_mm512_mask_cvtepu16_epi64`] - * [x] [`_mm512_mask_cvtepu32_epi64`] - * [x] [`_mm512_mask_cvtepu32_pd`] - * [x] [`_mm512_mask_cvtepu32_ps`] - * [x] [`_mm512_mask_cvtepu32lo_pd`] - * [x] [`_mm512_mask_cvtepu8_epi32`] - * [x] [`_mm512_mask_cvtepu8_epi64`] - * [x] [`_mm512_mask_cvtpd_epi32`] + + + + * [x] [`_mm512_mask_cvtpd_epu32`] * [x] [`_mm512_mask_cvtpd_ps`] * [x] [`_mm512_mask_cvtpd_pslo`] @@ -2150,15 +2199,10 @@ * [x] [`_mm512_maskz_cvt_roundps_epi32`] * [x] [`_mm512_maskz_cvt_roundps_epu32`] * [x] [`_mm512_maskz_cvt_roundps_pd`] - * [x] [`_mm512_maskz_cvtepi8_epi64`] - * [x] [`_mm512_maskz_cvtepu16_epi32`] - * [x] [`_mm512_maskz_cvtepu16_epi64`] - * [x] [`_mm512_maskz_cvtepu32_epi64`] - * [x] [`_mm512_maskz_cvtepu32_pd`] - * [x] [`_mm512_maskz_cvtepu32_ps`] - * [x] [`_mm512_maskz_cvtepu8_epi32`] - * [x] [`_mm512_maskz_cvtepu8_epi64`] - * [x] [`_mm512_maskz_cvtpd_epi32`] + + + + * [x] [`_mm512_maskz_cvtpd_epu32`] * [x] [`_mm512_maskz_cvtpd_ps`] * [x] [`_mm512_maskz_cvtph_ps`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 0e5a1ba461..f18ab81607 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10861,6 +10861,51 @@ pub unsafe fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 { )) } +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epi32&expand=1675) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub unsafe fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i { + transmute(vcvtpd2dq( + a.as_f64x8(), + _mm256_setzero_si256().as_i32x8(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epi32&expand=1676) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub unsafe fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + transmute(vcvtpd2dq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epi32&expand=1677) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub unsafe fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + transmute(vcvtpd2dq( + a.as_f64x8(), + _mm256_setzero_si256().as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + /// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. 
The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_pslo&expand=1715) @@ -11016,6 +11061,52 @@ pub unsafe fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i { transmute(simd_select_bitmask(k, convert, zero)) } +/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi8_epi64&expand=1542) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsxbq))] +pub unsafe fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepi8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +} + +/// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi8_epi64&expand=1543) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsxbq))] +pub unsafe fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepi8_epi64(a).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, convert, zero)) +} + +/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi8_epi64&expand=1539) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsxbq))] +pub unsafe fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepi8_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +} + +/// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi8_epi64&expand=1540) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsxbq))] +pub unsafe fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepi8_epi64(a).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, convert, zero)) +} + /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi32&expand=1621) @@ -11050,6 +11141,52 @@ pub unsafe fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i { transmute(simd_select_bitmask(k, convert, zero)) } +/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu8_epi32&expand=1619) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub unsafe fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) +} + +/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi32&expand=1620) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub unsafe fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, convert, zero)) +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu8_epi32&expand=1616) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub unsafe fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu8_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, src.as_i32x4())) +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi32&expand=1617) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxbd))] +pub unsafe fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu8_epi32(a).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, convert, zero)) +} + /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi64&expand=1630) @@ -11085,6 +11222,52 @@ pub unsafe fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i { transmute(simd_select_bitmask(k, convert, zero)) } +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu8_epi64&expand=1628) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub unsafe fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +} + +/// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi64&expand=1629) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub unsafe fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, convert, zero)) +} + +/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu8_epi64&expand=1625) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub unsafe fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu8_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +} + +/// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi64&expand=1626) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxbq))] +pub unsafe fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu8_epi64(a).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, convert, zero)) +} + /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_epi32&expand=1389) @@ -11279,6 +11462,52 @@ pub unsafe fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i { transmute(simd_select_bitmask(k, convert, zero)) } +/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_epi32&expand=1551) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxwd))] +pub unsafe fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu16_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) +} + +/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_epi32&expand=1552) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxwd))] +pub unsafe fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu16_epi32(a).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, convert, zero)) +} + +/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_epi32&expand=1548) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxwd))] +pub unsafe fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu16_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, src.as_i32x4())) +} + +/// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_epi32&expand=1549) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxwd))] +pub unsafe fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu16_epi32(a).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, convert, zero)) +} + /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_epi64&expand=1562) @@ -11313,6 +11542,52 @@ pub unsafe fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i { transmute(simd_select_bitmask(k, convert, zero)) } +/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_epi64&expand=1560) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxwq))] +pub unsafe fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +} + +/// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_epi64&expand=1561) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxwq))] +pub unsafe fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu16_epi64(a).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, convert, zero)) +} + +/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_epi64&expand=1557) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxwq))] +pub unsafe fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +} + +/// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_epi64&expand=1558) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxwq))] +pub unsafe fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu16_epi64(a).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, convert, zero)) +} + /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_epi64&expand=1428) @@ -11427,6 +11702,52 @@ pub unsafe fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i { transmute(simd_select_bitmask(k, convert, zero)) } +/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_epi64&expand=1569) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxdq))] +pub unsafe fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +} + +/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_epi64&expand=1570) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxdq))] +pub unsafe fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i { + let convert = _mm256_cvtepu32_epi64(a).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, convert, zero)) +} + +/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_epi64&expand=1566) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxdq))] +pub unsafe fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu32_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +} + +/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_epi64&expand=1567) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovzxdq))] +pub unsafe fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i { + let convert = _mm_cvtepu32_epi64(a).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, convert, zero)) +} + /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ps&expand=1455) @@ -11655,6 +11976,75 @@ pub unsafe fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d { transmute(simd_select_bitmask(k, convert, zero)) } +/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_pd&expand=1577) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub unsafe fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d { + let a = a.as_u32x4(); + transmute::<f64x4, _>(simd_cast(a)) +} + +/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_pd&expand=1578) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub unsafe fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d { + let convert = _mm256_cvtepu32_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, convert, src.as_f64x4())) +} + +/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_pd&expand=1579) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub unsafe fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d { + let convert = _mm256_cvtepu32_pd(a).as_f64x4(); + let zero = _mm256_setzero_pd().as_f64x4(); + transmute(simd_select_bitmask(k, convert, zero)) +} + +/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_pd&expand=1574) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub unsafe fn _mm_cvtepu32_pd(a: __m128i) -> __m128d { + let a = a.as_u32x4(); + let u64: u32x2 = simd_shuffle2(a, a, [0, 1]); + transmute::<f64x2, _>(simd_cast(u64)) +} + +/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_pd&expand=1575) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub unsafe fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + let convert = _mm_cvtepu32_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, convert, src.as_f64x2())) +} + +/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_pd&expand=1576) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtudq2pd))] +pub unsafe fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d { + let convert = _mm_cvtepu32_pd(a).as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + transmute(simd_select_bitmask(k, convert, zero)) +} + /// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32lo_pd&expand=1464) @@ -41991,6 +42381,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu8_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepu8_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi32() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu8_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtepi16_epi32() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -42091,6 +42523,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi32(-1); + let r = _mm256_mask_cvtepu16_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu16_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu16_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu16_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 
10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi32(-1); + let r = _mm_mask_cvtepu16_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu16_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu16_epi32() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu16_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu16_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtepi32_ps() { let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 29854d858b..cbb1a312b1 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -2901,6 +2901,35 @@ mod tests { assert_eq_m256(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvtpd_epi32(a); + let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvtpd_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtpd_epi32(src, 0b11111111, a); + let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvtpd_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtpd_epi32(0b11111111, a); + let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtpd_pslo() { let v2 = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); @@ -2953,6 +2982,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepi8_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepi8_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepi8_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepi8_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepi8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepi8_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepi8_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepi8_epi64() { + let a = 
_mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepi8_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepi8_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtepu8_epi64() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -2982,6 +3053,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepu8_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu8_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu8_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu8_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepu8_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu8_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu8_epi64() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu8_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu8_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtepi16_epi64() { let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); @@ -3082,6 +3195,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepu16_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu16_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_cvtepu16_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu16_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepu16_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu16_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu16_epi64() { + let a = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_cvtepu16_epi64(0, a); + assert_eq_m128i(r, 
_mm_setzero_si128()); + let r = _mm_maskz_cvtepu16_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtepi32_epi64() { let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); @@ -3182,6 +3337,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu32_epi64() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm256_set1_epi64x(-1); + let r = _mm256_mask_cvtepu32_epi64(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtepu32_epi64(src, 0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu32_epi64() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm256_maskz_cvtepu32_epi64(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtepu32_epi64(0b00001111, a); + let e = _mm256_set_epi64x(12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu32_epi64() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm_set1_epi64x(-1); + let r = _mm_mask_cvtepu32_epi64(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtepu32_epi64(src, 0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu32_epi64() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_maskz_cvtepu32_epi64(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtepu32_epi64(0b00000011, a); + let e = _mm_set_epi64x(14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtepi32_pd() { let a = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); @@ -3282,6 +3479,64 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm256_cvtepu32_pd(a); + let e = _mm256_set_pd(12., 13., 14., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm256_set1_pd(-1.); + let r = _mm256_mask_cvtepu32_pd(src, 0, a); + assert_eq_m256d(r, src); + let r = _mm256_mask_cvtepu32_pd(src, 0b00001111, a); + let e = _mm256_set_pd(12., 13., 14., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm256_maskz_cvtepu32_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_cvtepu32_pd(0b00001111, a); + let e = _mm256_set_pd(12., 13., 14., 15.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let r = _mm_cvtepu32_pd(a); + let e = _mm_set_pd(14., 15.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); + let src = _mm_set1_pd(-1.); + let r = _mm_mask_cvtepu32_pd(src, 0, a); + assert_eq_m128d(r, src); + let r = _mm_mask_cvtepu32_pd(src, 0b00000011, a); + let e = _mm_set_pd(14., 15.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtepu32_pd() { + let a = _mm_set_epi32(12, 13, 14, 15); 
+ let r = _mm_maskz_cvtepu32_pd(0, a); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_cvtepu32_pd(0b00000011, a); + let e = _mm_set_pd(14., 15.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtepi32lo_pd() { let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); From 9e55a04bb39d18a938dda09d41ae6b6abd059b73 Mon Sep 17 00:00:00 2001 From: jirong Date: Mon, 15 Feb 2021 08:54:23 -0500 Subject: [PATCH 02/31] cvtpd_epi32,epu32: mm256,mm --- crates/core_arch/avx512f.md | 35 +++-- crates/core_arch/src/x86/avx512f.rs | 180 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 129 ++++++++++++++++++ 3 files changed, 336 insertions(+), 8 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index c34df5cfc9..b51b502f43 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2065,6 +2065,8 @@ * [x] [`_mm256_cvtepu32_pd`] * [x] [`_mm256_mask_cvtepu32_pd`] * [x] [`_mm256_maskz_cvtepu32_pd`] + * [x] [`_mm512_cvtepu32lo_pd`] + * [x] [`_mm512_mask_cvtepu32lo_pd`] * [x] [`_mm512_cvtepu8_epi32`] * [x] [`_mm512_mask_cvtepu8_epi32`] * [x] [`_mm512_maskz_cvtepu8_epi32`] @@ -2082,8 +2084,25 @@ * [x] [`_mm512_cvtpd_epi32`] * [x] [`_mm512_mask_cvtpd_epi32`] * [x] [`_mm512_maskz_cvtpd_epi32`] + * [x] [`_mm_mask_cvtpd_epi32`] + * [x] [`_mm_maskz_cvtpd_epi32`] + * [x] [`_mm256_mask_cvtpd_epi32`] + * [x] [`_mm256_maskz_cvtpd_epi32`] + * [x] [`_mm512_cvtpd_epu32`] + * [x] [`_mm512_mask_cvtpd_epu32`] + * [x] [`_mm512_maskz_cvtpd_epu32`] + * [x] [`_mm_cvtpd_epu32`] + * [x] [`_mm_mask_cvtpd_epu32`] + * [x] [`_mm_maskz_cvtpd_epu32`] + * [x] [`_mm256_cvtpd_epu32`] + * [x] [`_mm256_mask_cvtpd_epu32`] + * [x] [`_mm256_maskz_cvtpd_epu32`] + * [x] [`_mm512_cvtpd_ps`] + * [x] [`_mm512_mask_cvtpd_ps`] + * [x] [`_mm512_maskz_cvtpd_ps`] + + - * [x] [`_mm512_mask_cvtepu32lo_pd`] * [x] [`_mm512_mask_cvtsepi64_epi32`] * [x] [`_mm512_mask_cvtsepi64_epi8`] @@ -2104,11 +2123,11 @@ - * [x] [`_mm512_cvtepu32lo_pd`] - * [x] [`_mm512_cvtpd_epu32`] - * [x] [`_mm512_cvtpd_ps`] + + + * [x] [`_mm512_cvtpd_pslo`] * [x] [`_mm512_cvtph_ps`] * [x] [`_mm512_cvtps_epi32`] @@ -2159,8 +2178,8 @@ - * [x] [`_mm512_mask_cvtpd_epu32`] - * [x] [`_mm512_mask_cvtpd_ps`] + + * [x] [`_mm512_mask_cvtpd_pslo`] * [x] [`_mm512_mask_cvtph_ps`] * [x] [`_mm512_mask_cvtps_epi32`] @@ -2203,8 +2222,8 @@ - * [x] [`_mm512_maskz_cvtpd_epu32`] - * [x] [`_mm512_maskz_cvtpd_ps`] + + * [x] [`_mm512_maskz_cvtph_ps`] * [x] [`_mm512_maskz_cvtps_epi32`] * [x] [`_mm512_maskz_cvtps_epu32`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index f18ab81607..7a588086bf 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10906,6 +10906,179 @@ pub unsafe fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i { )) } +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_epi32&expand=1673) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub unsafe fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + let convert = _mm256_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_epi32&expand=1674) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub unsafe fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i { + let convert = _mm256_cvtpd_epi32(a); + transmute(simd_select_bitmask( + k, + convert.as_i32x4(), + _mm_setzero_si128().as_i32x4(), + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_epi32&expand=1670) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub unsafe fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + let convert = _mm_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_epi32&expand=1671) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2dq))] +pub unsafe fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i { + let convert = _mm_cvtpd_epi32(a); + transmute(simd_select_bitmask( + k, + convert.as_i32x4(), + _mm_setzero_si128().as_i32x4(), + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epu32&expand=1693) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i { + transmute(vcvtpd2udq( + a.as_f64x8(), + _mm256_setzero_si256().as_u32x8(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epu32&expand=1694) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + transmute(vcvtpd2udq( + a.as_f64x8(), + src.as_u32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epu32&expand=1695) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + transmute(vcvtpd2udq( + a.as_f64x8(), + _mm256_setzero_si256().as_u32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_epu32&expand=1690) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i { + transmute(vcvtpd2udq256( + a.as_f64x4(), + _mm_setzero_si128().as_u32x4(), + 0b11111111, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_epu32&expand=1691) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + transmute(vcvtpd2udq256(a.as_f64x4(), src.as_u32x4(), k)) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_epu32&expand=1692) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i { + transmute(vcvtpd2udq256( + a.as_f64x4(), + _mm_setzero_si128().as_u32x4(), + k, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epu32&expand=1687) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm_cvtpd_epu32(a: __m128d) -> __m128i { + transmute(vcvtpd2udq128( + a.as_f64x2(), + _mm_setzero_si128().as_u32x4(), + 0b11111111, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_epu32&expand=1688) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + transmute(vcvtpd2udq128(a.as_f64x2(), src.as_u32x4(), k)) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_epu32&expand=1689) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2udq))] +pub unsafe fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i { + transmute(vcvtpd2udq128( + a.as_f64x2(), + _mm_setzero_si128().as_u32x4(), + k, + )) +} + /// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_pslo&expand=1715) @@ -37119,10 +37292,17 @@ extern "C" { fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8; #[link_name = "llvm.x86.avx512.mask.cvtpd2ps.512"] fn vcvtpd2ps(a: f64x8, src: f32x8, mask: u8, rounding: i32) -> f32x8; + #[link_name = "llvm.x86.avx512.mask.cvtpd2dq.512"] fn vcvtpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.512"] fn vcvtpd2udq(a: f64x8, src: u32x8, mask: u8, rounding: i32) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.256"] + fn vcvtpd2udq256(a: f64x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.128"] + fn vcvtpd2udq128(a: f64x2, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.sitofp.round.v16f32.v16i32"] fn vcvtdq2ps(a: i32x16, rounding: i32) -> f32x16; #[link_name = "llvm.x86.avx512.uitofp.round.v16f32.v16i32"] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index cbb1a312b1..5627f61f34 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -2930,6 +2930,135 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_epi32() { + let a = _mm256_set_pd(4., -5.5, 6., -7.5); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvtpd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtpd_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, -6, 6, -8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_epi32() { + let a = _mm256_set_pd(4., -5.5, 6., -7.5); + let r = _mm256_maskz_cvtpd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtpd_epi32(0b00001111, a); + let e = _mm_set_epi32(4, -6, 6, -8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtpd_epi32() { + let a = _mm_set_pd(6., -7.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtpd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtpd_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, -8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_epi32() { + let a = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvtpd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtpd_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, -8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtpd_epu32() { + let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5); + let r = _mm512_cvtpd_epu32(a); + let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtpd_epu32() { + let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvtpd_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtpd_epu32(src, 0b11111111, a); + let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtpd_epu32() { + let a = _mm512_setr_pd(0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5); + let r = _mm512_maskz_cvtpd_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = 
_mm512_maskz_cvtpd_epu32(0b11111111, a); + let e = _mm256_setr_epi32(0, 2, 2, 4, 4, 6, 6, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let r = _mm256_cvtpd_epu32(a); + let e = _mm_set_epi32(4, 6, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvtpd_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtpd_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 6, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let r = _mm256_maskz_cvtpd_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtpd_epu32(0b00001111, a); + let e = _mm_set_epi32(4, 6, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let r = _mm_cvtpd_epu32(a); + let e = _mm_set_epi32(0, 0, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtpd_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtpd_epu32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let r = _mm_maskz_cvtpd_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtpd_epu32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, 8); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtpd_pslo() { let v2 = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); From b34d37882a561b5b015a0652b470c50530281543 Mon Sep 17 00:00:00 2001 From: jirong Date: Mon, 15 Feb 2021 09:21:26 -0500 Subject: [PATCH 03/31] cvtpd_ps: mm256,mm --- crates/core_arch/avx512f.md | 14 +++++--- crates/core_arch/src/x86/avx512f.rs | 46 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 42 +++++++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index b51b502f43..10456afaf7 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2100,7 +2100,13 @@ * [x] [`_mm512_cvtpd_ps`] * [x] [`_mm512_mask_cvtpd_ps`] * [x] [`_mm512_maskz_cvtpd_ps`] - + * [x] [`_mm_mask_cvtpd_ps`] + * [x] [`_mm_maskz_cvtpd_ps`] + * [x] [`_mm256_mask_cvtpd_ps`] + * [x] [`_mm256_maskz_cvtpd_ps`] + * [x] [`_mm512_cvtpd_pslo`] + * [x] [`_mm512_mask_cvtpd_pslo`] + * [x] [`_mm512_cvtph_ps`] @@ -2128,8 +2134,8 @@ - * [x] [`_mm512_cvtpd_pslo`] - * [x] [`_mm512_cvtph_ps`] + + * [x] [`_mm512_cvtps_epi32`] * [x] [`_mm512_cvtps_epu32`] * [x] [`_mm512_cvtps_pd`] @@ -2180,7 +2186,7 @@ - * [x] [`_mm512_mask_cvtpd_pslo`] + * [x] [`_mm512_mask_cvtph_ps`] * [x] [`_mm512_mask_cvtps_epi32`] * [x] [`_mm512_mask_cvtps_epu32`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7a588086bf..7d792673f4 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10861,6 +10861,52 @@ pub unsafe fn _mm512_maskz_cvtpd_ps(k: __mmask8, 
a: __m512d) -> __m256 { )) } +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ps&expand=1710) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub unsafe fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m128 { + let convert = _mm256_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ps&expand=1711) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub unsafe fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 { + let convert = _mm256_cvtpd_ps(a); + let zero = _mm_setzero_ps().as_f32x4(); + transmute(simd_select_bitmask(k, convert.as_f32x4(), zero)) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ps&expand=1707) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub unsafe fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 { + let convert = _mm_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ps&expand=1708) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtpd2ps))] +pub unsafe fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 { + let convert = _mm_cvtpd_ps(a); + let zero = _mm_setzero_ps().as_f32x4(); + transmute(simd_select_bitmask(k, convert.as_f32x4(), zero)) +} + /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epi32&expand=1675) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 5627f61f34..c5fb635bfd 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -2901,6 +2901,48 @@ mod tests { assert_eq_m256(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtpd_ps() { + let a = _mm256_set_pd(4., -5.5, 6., -7.5); + let src = _mm_set1_ps(0.); + let r = _mm256_mask_cvtpd_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm256_mask_cvtpd_ps(src, 0b00001111, a); + let e = _mm_set_ps(4., -5.5, 6., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtpd_ps() { + let a = _mm256_set_pd(4., -5.5, 6., -7.5); + let r = _mm256_maskz_cvtpd_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm256_maskz_cvtpd_ps(0b00001111, a); + let e = _mm_set_ps(4., -5.5, 6., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtpd_ps() { + let a = _mm_set_pd(6., -7.5); + let src = _mm_set1_ps(0.); + let r = _mm_mask_cvtpd_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtpd_ps(src, 0b00000011, a); + let e = _mm_set_ps(0., 0., 6., -7.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtpd_ps() { + let a = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvtpd_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtpd_ps(0b00000011, a); + let e = _mm_set_ps(0., 0., 6., -7.5); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); From 73f1573a49ddd952852c38a186e8fde640d84295 Mon Sep 17 00:00:00 2001 From: jirong Date: Mon, 15 Feb 2021 13:24:24 -0500 Subject: [PATCH 04/31] cvtps_epi32,epu32,ph: mm256,mm; --- crates/core_arch/avx512f.md | 73 ++-- crates/core_arch/src/x86/avx512f.rs | 510 ++++++++++++++++++++++++++-- 2 files changed, 516 insertions(+), 67 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 10456afaf7..f57471bfd8 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2107,8 +2107,40 @@ * [x] [`_mm512_cvtpd_pslo`] * [x] [`_mm512_mask_cvtpd_pslo`] * [x] [`_mm512_cvtph_ps`] - - + * [x] [`_mm512_mask_cvtph_ps`] + * [x] [`_mm512_maskz_cvtph_ps`] + * [x] [`_mm_mask_cvtph_ps`] + * [x] [`_mm_maskz_cvtph_ps`] + * [x] [`_mm256_mask_cvtph_ps`] + * [x] [`_mm256_maskz_cvtph_ps`] + * [x] [`_mm512_cvtps_epi32`] + * [x] [`_mm512_mask_cvtps_epi32`] + * [x] [`_mm512_maskz_cvtps_epi32`] + * [x] [`_mm_mask_cvtps_epi32`] + * [x] [`_mm_maskz_cvtps_epi32`] + * [x] [`_mm256_mask_cvtps_epi32`] + * [x] [`_mm256_maskz_cvtps_epi32`] + * [x] [`_mm512_cvtps_epu32`] + * [x] [`_mm512_mask_cvtps_epu32`] + * [x] [`_mm512_maskz_cvtps_epu32`] + * [x] [`_mm_cvtps_epu32`] + * [x] [`_mm_mask_cvtps_epu32`] + * [x] [`_mm_maskz_cvtps_epu32`] + * [x] [`_mm256_cvtps_epu32`] + * [x] [`_mm256_mask_cvtps_epu32`] + * [x] [`_mm256_maskz_cvtps_epu32`] + * [x] [`_mm512_cvtps_pd`] + * [x] [`_mm512_mask_cvtps_pd`] + * [x] [`_mm512_maskz_cvtps_pd`] + * [x] [`_mm512_cvtps_ph`] + * [x] [`_mm512_mask_cvtps_ph`] + * [x] [`_mm512_maskz_cvtps_ph`] + * [x] [`_mm_mask_cvtps_ph`] + * [x] [`_mm_maskz_cvtps_ph`] + * [x] [`_mm256_mask_cvtps_ph`] + * [x] [`_mm256_maskz_cvtps_ph`] + * 
[x] [`_mm512_cvtpslo_pd`] + * [x] [`_mm512_mask_cvtpslo_pd`] * [x] [`_mm512_mask_cvtsepi64_epi32`] * [x] [`_mm512_mask_cvtsepi64_epi8`] @@ -2126,21 +2158,6 @@ * [x] [`_mm512_cvt_roundps_pd`] * [x] [`_mm512_mask_cvtsepi64_epi16`] - - - - - - - - - - - * [x] [`_mm512_cvtps_epi32`] - * [x] [`_mm512_cvtps_epu32`] - * [x] [`_mm512_cvtps_pd`] - * [x] [`_mm512_cvtps_ph`] - * [x] [`_mm512_cvtpslo_pd`] * [x] [`_mm512_cvtsepi32_epi16`] * [x] [`_mm512_cvtsepi32_epi8`] * [x] [`_mm512_cvtsepi64_epi16`] @@ -2181,18 +2198,6 @@ * [x] [`_mm512_mask_cvt_roundps_epu32`] * [x] [`_mm512_mask_cvt_roundps_pd`] - - - - - - - * [x] [`_mm512_mask_cvtph_ps`] - * [x] [`_mm512_mask_cvtps_epi32`] - * [x] [`_mm512_mask_cvtps_epu32`] - * [x] [`_mm512_mask_cvtps_pd`] - * [x] [`_mm512_mask_cvtps_ph`] - * [x] [`_mm512_mask_cvtpslo_pd`] * [x] [`_mm512_mask_cvtsepi32_epi16`] * [x] [`_mm512_mask_cvtsepi32_epi8`] * [ ] [`_mm512_mask_cvtsepi32_storeu_epi16`] @@ -2225,16 +2230,6 @@ * [x] [`_mm512_maskz_cvt_roundps_epu32`] * [x] [`_mm512_maskz_cvt_roundps_pd`] - - - - - - * [x] [`_mm512_maskz_cvtph_ps`] - * [x] [`_mm512_maskz_cvtps_epi32`] - * [x] [`_mm512_maskz_cvtps_epu32`] - * [x] [`_mm512_maskz_cvtps_pd`] - * [x] [`_mm512_maskz_cvtps_ph`] * [x] [`_mm512_maskz_cvtsepi32_epi16`] * [x] [`_mm512_maskz_cvtsepi32_epi8`] * [x] [`_mm512_maskz_cvtsepi64_epi16`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7d792673f4..4907e11758 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10696,6 +10696,52 @@ pub unsafe fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i { )) } +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_epi32&expand=1735) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub unsafe fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + let convert = _mm256_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x8(), src.as_i32x8())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_epi32&expand=1736) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub unsafe fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i { + let convert = _mm256_cvtps_epi32(a); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, convert.as_i32x8(), zero)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
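+///
+/// A minimal usage sketch showing how the write mask selects lanes (illustrative values; assumes the `avx512f` and `avx512vl` target features and an enclosing `unsafe` block):
+///
+/// ```ignore
+/// let a = _mm_set_ps(4.5, 3.5, 2.5, 1.5);
+/// let src = _mm_set1_epi32(-1);
+/// // Only lanes 0 and 1 are converted; lanes 2 and 3 keep the values from `src`.
+/// let r = _mm_mask_cvtps_epi32(src, 0b0000_0011, a);
+/// // Lanes of r, low to high: 2, 2, -1, -1 (1.5 and 2.5 round to nearest even).
+/// ```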
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_epi32&expand=1732) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub unsafe fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + let convert = _mm_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_epi32&expand=1733) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2dq))] +pub unsafe fn _mm_maskz_cvtps_epi32(k: __mmask8, a: __m128) -> __m128i { + let convert = _mm_cvtps_epi32(a); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, convert.as_i32x4(), zero)) +} + /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epu32&expand=1755) @@ -10741,6 +10787,82 @@ pub unsafe fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i { )) } +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_epu32&expand=1752) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub unsafe fn _mm256_cvtps_epu32(a: __m256) -> __m256i { + transmute(vcvtps2udq256( + a.as_f32x8(), + _mm256_setzero_si256().as_u32x8(), + 0b11111111, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_epu32&expand=1753) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub unsafe fn _mm256_mask_cvtps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + transmute(vcvtps2udq256(a.as_f32x8(), src.as_u32x8(), k)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_epu32&expand=1754) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub unsafe fn _mm256_maskz_cvtps_epu32(k: __mmask8, a: __m256) -> __m256i { + transmute(vcvtps2udq256( + a.as_f32x8(), + _mm256_setzero_si256().as_u32x8(), + k, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. 
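+///
+/// A minimal usage sketch (illustrative values; assumes the `avx512f` and `avx512vl` target features and an enclosing `unsafe` block):
+///
+/// ```ignore
+/// let a = _mm_set_ps(15.5, 14., 13.5, 12.);
+/// let r = _mm_cvtps_epu32(a);
+/// // Lanes of r, low to high: 12, 14, 14, 16 (rounded to nearest even, stored as unsigned).
+/// ```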
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_epu32&expand=1749) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub unsafe fn _mm_cvtps_epu32(a: __m128) -> __m128i { + transmute(vcvtps2udq128( + a.as_f32x4(), + _mm_setzero_si128().as_u32x4(), + 0b11111111, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_epu32&expand=1750) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub unsafe fn _mm_mask_cvtps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + transmute(vcvtps2udq128(a.as_f32x4(), src.as_u32x4(), k)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_epu32&expand=1751) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2udq))] +pub unsafe fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i { + transmute(vcvtps2udq128( + a.as_f32x4(), + _mm_setzero_si128().as_u32x4(), + k, + )) +} + /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_pd&expand=1769) @@ -13880,8 +14002,7 @@ pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512, sae: i32) -> } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ @@ -13891,28 +14012,27 @@ pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512, sae: i32) -> /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvt_roundps_ph&expand=1352) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] +#[cfg_attr(test, assert_instr(vcvtps2ph, imm8 = 8))] #[rustc_args_required_const(3)] pub unsafe fn _mm256_mask_cvt_roundps_ph( src: __m128i, k: __mmask8, a: __m256, - sae: i32, + imm8: i32, ) -> __m128i { let a = a.as_f32x8(); let src = src.as_i16x8(); macro_rules! 
call { - ($imm4:expr) => { - vcvtps2ph256(a, $imm4, src, k) + ($imm8:expr) => { + vcvtps2ph256(a, $imm8, src, k) }; } - let r = constify_imm4_round!(sae, call); + let r = constify_imm8_sae!(imm8, call); transmute(r) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ @@ -13922,23 +14042,22 @@ pub unsafe fn _mm256_mask_cvt_roundps_ph( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvt_roundps_ph&expand=1353) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] +#[cfg_attr(test, assert_instr(vcvtps2ph, imm8 = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m256, sae: i32) -> __m128i { +pub unsafe fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m256, imm8: i32) -> __m128i { let a = a.as_f32x8(); let zero = _mm_setzero_si128().as_i16x8(); macro_rules! call { - ($imm4:expr) => { - vcvtps2ph256(a, $imm4, zero, k) + ($imm8:expr) => { + vcvtps2ph256(a, $imm8, zero, k) }; } - let r = constify_imm4_round!(sae, call); + let r = constify_imm8_sae!(imm8, call); transmute(r) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ @@ -13948,23 +14067,22 @@ pub unsafe fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m256, sae: i32) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundps_ph&expand=1350) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] +#[cfg_attr(test, assert_instr(vcvtps2ph, imm8 = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_cvt_roundps_ph(src: __m128i, k: __mmask8, a: __m128, sae: i32) -> __m128i { +pub unsafe fn _mm_mask_cvt_roundps_ph(src: __m128i, k: __mmask8, a: __m128, imm8: i32) -> __m128i { let a = a.as_f32x4(); let src = src.as_i16x8(); macro_rules! 
call { - ($imm4:expr) => { - vcvtps2ph128(a, $imm4, src, k) + ($imm8:expr) => { + vcvtps2ph128(a, $imm8, src, k) }; } - let r = constify_imm4_round!(sae, call); + let r = constify_imm8_sae!(imm8, call); transmute(r) } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ @@ -13974,17 +14092,17 @@ pub unsafe fn _mm_mask_cvt_roundps_ph(src: __m128i, k: __mmask8, a: __m128, sae: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundps_ph&expand=1351) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] +#[cfg_attr(test, assert_instr(vcvtps2ph, imm8 = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128, sae: i32) -> __m128i { +pub unsafe fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128, imm8: i32) -> __m128i { let a = a.as_f32x4(); let zero = _mm_setzero_si128().as_i16x8(); macro_rules! call { - ($imm4:expr) => { - vcvtps2ph128(a, $imm4, zero, k) + ($imm8:expr) => { + vcvtps2ph128(a, $imm8, zero, k) }; } - let r = constify_imm4_round!(sae, call); + let r = constify_imm8_sae!(imm8, call); transmute(r) } @@ -14048,6 +14166,106 @@ pub unsafe fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512, sae: i32) -> __m256 transmute(r) } +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_ph&expand=1776) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2ph, imm8 = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m256, imm8: i32) -> __m128i { + let a = a.as_f32x8(); + let src = src.as_i16x8(); + macro_rules! 
call { + ($imm8:expr) => { + vcvtps2ph256(a, $imm8, src, k) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_ph&expand=1777) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2ph, imm8 = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_maskz_cvtps_ph(k: __mmask8, a: __m256, imm8: i32) -> __m128i { + let a = a.as_f32x8(); + let zero = _mm_setzero_si128().as_i16x8(); + macro_rules! call { + ($imm8:expr) => { + vcvtps2ph256(a, $imm8, zero, k) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_ph&expand=1773) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2ph, imm8 = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m128, imm8: i32) -> __m128i { + let a = a.as_f32x4(); + let src = src.as_i16x8(); + macro_rules! call { + ($imm8:expr) => { + vcvtps2ph128(a, $imm8, src, k) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ +/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\ +/// _MM_FROUND_TO_NEAREST_INT // round to nearest\ +/// _MM_FROUND_TO_NEG_INF // round down\ +/// _MM_FROUND_TO_POS_INF // round up\ +/// _MM_FROUND_TO_ZERO // truncate\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_ph&expand=1774) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtps2ph, imm8 = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_maskz_cvtps_ph(k: __mmask8, a: __m128, imm8: i32) -> __m128i { + let a = a.as_f32x4(); + let zero = _mm_setzero_si128().as_i16x8(); + macro_rules! 
call { + ($imm8:expr) => { + vcvtps2ph128(a, $imm8, zero, k) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// @@ -14158,6 +14376,52 @@ pub unsafe fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 { )) } +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_ps&expand=1721) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub unsafe fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m256 { + let convert = _mm256_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x8(), src.as_f32x8())) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_ps&expand=1722) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub unsafe fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 { + let convert = _mm256_cvtph_ps(a); + let zero = _mm256_setzero_ps().as_f32x8(); + transmute(simd_select_bitmask(k, convert.as_f32x8(), zero)) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_ps&expand=1718) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub unsafe fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + let convert = _mm_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) +} + +/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_ps&expand=1719) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvtph2ps))] +pub unsafe fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 { + let convert = _mm_cvtph_ps(a); + let zero = _mm_setzero_ps().as_f32x4(); + transmute(simd_select_bitmask(k, convert.as_f32x4(), zero)) +} + /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
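+///
+/// A minimal usage sketch (illustrative values; assumes the `avx512f` target feature and an enclosing `unsafe` block):
+///
+/// ```ignore
+/// let a = _mm512_set1_ps(1.9);
+/// // Truncate toward zero while suppressing floating-point exceptions.
+/// let r = _mm512_cvtt_roundps_epi32(a, _MM_FROUND_NO_EXC);
+/// // Every lane of r holds 1.
+/// ```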
/// @@ -37332,8 +37596,14 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.cvtps2dq.512"] fn vcvtps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.512"] fn vcvtps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16; + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.256"] + fn vcvtps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvtps2udq.128"] + fn vcvtps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.cvtps2pd.512"] fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8; #[link_name = "llvm.x86.avx512.mask.cvtpd2ps.512"] @@ -42472,6 +42742,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvtps_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtps_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvtps_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtps_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtps_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvtps_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtps_epu32() { let a = _mm512_setr_ps( @@ -42507,6 +42819,64 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_cvtps_epu32(a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvtps_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvtps_epu32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvtps_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvtps_epu32(0b11111111, a); + let e = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtps_epu32() { + let a = 
_mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_cvtps_epu32(a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtps_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvtps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 14, 14, 16); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtepi8_epi32() { let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -43840,6 +44210,48 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtps_ph(src, 0, a, _MM_FROUND_NO_EXC); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtps_ph(src, 0b11111111, a, _MM_FROUND_NO_EXC); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtps_ph() { + let a = _mm256_set1_ps(1.); + let r = _mm256_maskz_cvtps_ph(0, a, _MM_FROUND_NO_EXC); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtps_ph(0b11111111, a, _MM_FROUND_NO_EXC); + let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtps_ph() { + let a = _mm_set1_ps(1.); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtps_ph(src, 0, a, _MM_FROUND_NO_EXC); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtps_ph(src, 0b00001111, a, _MM_FROUND_NO_EXC); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtps_ph() { + let a = _mm_set1_ps(1.); + let r = _mm_maskz_cvtps_ph(0, a, _MM_FROUND_NO_EXC); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtps_ph(0b00001111, a, _MM_FROUND_NO_EXC); + let e = _mm_setr_epi64x(4323521613979991040, 0); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundph_ps() { let a = _mm256_setr_epi64x( @@ -43936,6 +44348,48 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm256_set1_ps(0.); + let r = _mm256_mask_cvtph_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_cvtph_ps(src, 0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm256_maskz_cvtph_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_cvtph_ps(0b11111111, a); + let e = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm_mask_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let src = _mm_set1_ps(0.); + let r = _mm_mask_cvtph_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_cvtph_ps(src, 0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtph_ps() { + let a = _mm_setr_epi64x(4323521613979991040, 4323521613979991040); + let r = _mm_maskz_cvtph_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_cvtph_ps(0b00001111, a); + let e = _mm_setr_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtt_roundps_epi32() { let a = _mm512_setr_ps( From 3e2275bcace45a102af531946041a522fa997378 Mon Sep 17 00:00:00 2001 From: jirong Date: Tue, 16 Feb 2021 10:50:20 -0500 Subject: [PATCH 05/31] cvtsepi32_epi8,epi16: mm256,mm --- crates/core_arch/avx512f.md | 34 +- crates/core_arch/src/x86/avx512f.rs | 1076 ++++++++++++--------------- 2 files changed, 510 insertions(+), 600 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index f57471bfd8..2347aa8189 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2141,6 +2141,29 @@ * [x] [`_mm256_maskz_cvtps_ph`] * [x] [`_mm512_cvtpslo_pd`] * [x] [`_mm512_mask_cvtpslo_pd`] + * [x] [`_mm512_cvtsepi32_epi16`] + * [x] [`_mm512_mask_cvtsepi32_epi16`] + * [x] [`_mm512_maskz_cvtsepi32_epi16`] + * [x] [`_mm_cvtsepi32_epi16`] + * [x] [`_mm_mask_cvtsepi32_epi16`] + * [x] [`_mm_maskz_cvtsepi32_epi16`] + * [x] [`_mm256_cvtsepi32_epi16`] + * [x] [`_mm256_mask_cvtsepi32_epi16`] + * [x] [`_mm256_maskz_cvtsepi32_epi16`] + * [x] [`_mm512_cvtsepi32_epi8`] + * [x] [`_mm512_mask_cvtsepi32_epi8`] + * [x] [`_mm512_maskz_cvtsepi32_epi8`] + * [x] [`_mm_cvtsepi32_epi8`] + * [x] [`_mm_mask_cvtsepi32_epi8`] + * [x] [`_mm_maskz_cvtsepi32_epi8`] + * [x] [`_mm256_cvtsepi32_epi8`] + * [x] [`_mm256_mask_cvtsepi32_epi8`] + * [x] [`_mm256_maskz_cvtsepi32_epi8`] + * [ ] [`_mm512_mask_cvtsepi32_storeu_epi16`] + * [ ] [`_mm512_mask_cvtsepi32_storeu_epi8`] + + + * [x] [`_mm512_mask_cvtsepi64_epi32`] * [x] [`_mm512_mask_cvtsepi64_epi8`] @@ -2158,8 +2181,7 @@ * [x] [`_mm512_cvt_roundps_pd`] * [x] [`_mm512_mask_cvtsepi64_epi16`] - * [x] [`_mm512_cvtsepi32_epi16`] - * [x] [`_mm512_cvtsepi32_epi8`] + * [x] [`_mm512_cvtsepi64_epi16`] * [x] [`_mm512_cvtsepi64_epi32`] * [x] [`_mm512_cvtsepi64_epi8`] @@ -2198,10 +2220,7 @@ * [x] [`_mm512_mask_cvt_roundps_epu32`] * [x] [`_mm512_mask_cvt_roundps_pd`] - * [x] [`_mm512_mask_cvtsepi32_epi16`] - * [x] [`_mm512_mask_cvtsepi32_epi8`] - * [ ] [`_mm512_mask_cvtsepi32_storeu_epi16`] - * [ ] [`_mm512_mask_cvtsepi32_storeu_epi8`] + * [x] [`_mm512_mask_cvtt_roundpd_epi32`] * [x] [`_mm512_mask_cvtt_roundpd_epu32`] * [x] [`_mm512_mask_cvtt_roundps_epi32`] @@ -2230,8 +2249,7 @@ * [x] [`_mm512_maskz_cvt_roundps_epu32`] * [x] [`_mm512_maskz_cvt_roundps_pd`] - * [x] [`_mm512_maskz_cvtsepi32_epi16`] - * [x] [`_mm512_maskz_cvtsepi32_epi8`] + * [x] [`_mm512_maskz_cvtsepi64_epi16`] * [x] [`_mm512_maskz_cvtsepi64_epi32`] * [x] [`_mm512_maskz_cvtsepi64_epi8`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4907e11758..0f74b4a7b2 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -12980,6 +12980,74 @@ pub unsafe fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i )) } +/// Convert packed signed 32-bit integers in a to 
packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi32_epi16&expand=1816)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i {
+    transmute(vpmovsdw256(
+        a.as_i32x8(),
+        _mm_setzero_si128().as_i16x8(),
+        0b11111111,
+    ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_epi16&expand=1817)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
+    transmute(vpmovsdw256(a.as_i32x8(), src.as_i16x8(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi32_epi16&expand=1818)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i {
+    transmute(vpmovsdw256(a.as_i32x8(), _mm_setzero_si128().as_i16x8(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi32_epi16&expand=1813)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i {
+    transmute(vpmovsdw128(
+        a.as_i32x4(),
+        _mm_setzero_si128().as_i16x8(),
+        0b11111111,
+    ))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_epi16&expand=1814)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    transmute(vpmovsdw128(a.as_i32x4(), src.as_i16x8(), k))
+}
+
+/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi32_epi16&expand=1815)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpmovsdw))]
+pub unsafe fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i {
+    transmute(vpmovsdw128(a.as_i32x4(), _mm_setzero_si128().as_i16x8(), k))
+}
+
 /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
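+///
+/// A minimal usage sketch showing the saturating behavior (illustrative values; assumes the `avx512f` target feature and an enclosing `unsafe` block):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(300);
+/// let r = _mm512_cvtsepi32_epi8(a);
+/// // 300 does not fit in an i8, so every lane of r saturates to i8::MAX (127).
+/// ```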
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi32_epi8&expand=1828) @@ -13014,6 +13082,74 @@ pub unsafe fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { transmute(vpmovsdb(a.as_i32x16(), _mm_setzero_si128().as_i8x16(), k)) } +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi32_epi8&expand=1825) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i { + transmute(vpmovsdb256( + a.as_i32x8(), + _mm_setzero_si128().as_i8x16(), + 0b11111111, + )) +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_epi8&expand=1826) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovsdb256(a.as_i32x8(), src.as_i8x16(), k)) +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi32_epi8&expand=1827) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovsdb256(a.as_i32x8(), _mm_setzero_si128().as_i8x16(), k)) +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi32_epi8&expand=1822) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i { + transmute(vpmovsdb128( + a.as_i32x4(), + _mm_setzero_si128().as_i8x16(), + 0b11111111, + )) +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_epi8&expand=1823) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovsdb128(a.as_i32x4(), src.as_i8x16(), k)) +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
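+///
+/// A minimal usage sketch showing the zero mask and saturation together (illustrative values; assumes the `avx512f` and `avx512vl` target features and an enclosing `unsafe` block):
+///
+/// ```ignore
+/// let a = _mm_set_epi32(1, 2, 3, 400);
+/// // Only lane 0 is converted; every other result byte is zeroed.
+/// let r = _mm_maskz_cvtsepi32_epi8(0b0000_0001, a);
+/// // Byte 0 of r is 127 (400 saturates to i8::MAX); bytes 1..=15 are 0.
+/// ```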
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi32_epi8&expand=1824) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovsdb128(a.as_i32x4(), _mm_setzero_si128().as_i8x16(), k)) +} + /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi32&expand=1852) @@ -37699,8 +37835,18 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovs.dw.512"] fn vpmovsdw(a: i32x16, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.256"] + fn vpmovsdw256(a: i32x8, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.128"] + fn vpmovsdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.db.512"] fn vpmovsdb(a: i32x16, src: i8x16, mask: u16) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.db.256"] + fn vpmovsdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.db.128"] + fn vpmovsdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.512"] fn vpmovsqd(a: i64x8, src: i32x8, mask: u8) -> i32x8; #[link_name = "llvm.x86.avx512.mask.pmovs.qw.512"] @@ -41664,23 +41810,12 @@ mod tests { c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, ); assert_eq_m512(r, e); } @@ -41693,6 +41828,7 @@ mod tests { let r = _mm512_maskz_fmadd_round_ps(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); assert_eq_m512(r, _mm512_setzero_ps()); + #[rustfmt::skip] let r = _mm512_maskz_fmadd_round_ps( 0b00000000_11111111, a, @@ -41700,23 +41836,12 @@ mod tests { c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - 0., - 0., - 0., - 0., - 0., - 0., - 0., - 0., + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., ); assert_eq_m512(r, e); } @@ -41736,23 +41861,12 @@ mod tests { 0b00000000_11111111, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -1., - -1., - -1., - -1., - -1., - -1., - -1., - -1., + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -1., -1., -1., -1., + -1., -1., -1., -1., ); assert_eq_m512(r, e); } @@ -41785,23 +41899,12 @@ mod tests { c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - 
-0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, ); assert_eq_m512(r, e); } @@ -41821,23 +41924,12 @@ mod tests { c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - 0., - 0., - 0., - 0., - 0., - 0., - 0., - 0., + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., ); assert_eq_m512(r, e); } @@ -41857,23 +41949,12 @@ mod tests { 0b00000000_11111111, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - -0.99999994, - 1., - 1., - 1., - 1., - 1., - 1., - 1., - 1., + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + -0.99999994, -0.99999994, -0.99999994, -0.99999994, + 1., 1., 1., 1., + 1., 1., 1., 1., ); assert_eq_m512(r, e); } @@ -41884,23 +41965,12 @@ mod tests { let b = _mm512_set1_ps(1.); let c = _mm512_set1_ps(-1.); let r = _mm512_fmaddsub_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + #[rustfmt::skip] let e = _mm512_setr_ps( - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, ); assert_eq_m512(r, e); let r = _mm512_fmaddsub_round_ps(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); @@ -41931,23 +42001,12 @@ mod tests { c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, ); assert_eq_m512(r, e); } @@ -41972,23 +42031,12 @@ mod tests { c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 0., - 0., - 0., - 0., - 0., - 0., - 0., - 0., + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 0., 0., 0., 0., + 0., 0., 0., 0., ); assert_eq_m512(r, e); } @@ -42013,23 +42061,12 @@ mod tests { 0b00000000_11111111, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - -1., - -1., - -1., - -1., - -1., - -1., - -1., - -1., + 1.0000001, -0.99999994, 1.0000001, -0.99999994, + 1.0000001, 
-0.99999994, 1.0000001, -0.99999994, + -1., -1., -1., -1., + -1., -1., -1., -1., ); assert_eq_m512(r, e); } @@ -42040,23 +42077,12 @@ mod tests { let b = _mm512_set1_ps(1.); let c = _mm512_set1_ps(-1.); let r = _mm512_fmsubadd_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, ); assert_eq_m512(r, e); let r = _mm512_fmsubadd_round_ps(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); @@ -42087,23 +42113,12 @@ mod tests { c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, - 0.00000007, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, + 0.00000007, 0.00000007, 0.00000007, 0.00000007, ); assert_eq_m512(r, e); } @@ -42128,23 +42143,12 @@ mod tests { c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - 0., - 0., - 0., - 0., - 0., - 0., - 0., - 0., + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + 0., 0., 0., 0., + 0., 0., 0., 0., ); assert_eq_m512(r, e); } @@ -42169,23 +42173,12 @@ mod tests { 0b00000000_11111111, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -0.99999994, - 1.0000001, - -1., - -1., - -1., - -1., - -1., - -1., - -1., - -1., + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -0.99999994, 1.0000001, -0.99999994, 1.0000001, + -1., -1., -1., -1., + -1., -1., -1., -1., ); assert_eq_m512(r, e); } @@ -42574,23 +42567,12 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_fixupimm_round_ps() { + #[rustfmt::skip] let a = _mm512_set_ps( - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - 1., - 1., - 1., - 1., - 1., - 1., - 1., - 1., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., ); let b = _mm512_set1_ps(f32::MAX); let c = _mm512_set1_epi32(i32::MAX); @@ -42610,23 +42592,12 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_fixupimm_round_ps() { + #[rustfmt::skip] let a = _mm512_set_ps( - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - f32::NAN, - 1., - 1., - 1., - 1., - 1., - 1., - 1., - 1., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + f32::NAN, f32::NAN, f32::NAN, f32::NAN, + 1., 1., 1., 1., + 1., 1., 1., 1., ); let b = _mm512_set1_ps(f32::MAX); let c = _mm512_set1_epi32(i32::MAX); @@ -43449,285 +43420,294 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtsepi32_epi16() { + #[rustfmt::skip] let a = 
_mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MAX, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); let r = _mm512_cvtsepi32_epi16(a); + #[rustfmt::skip] let e = _mm256_set_epi16( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i16::MIN, - i16::MAX, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, ); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtsepi32_epi16() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MAX, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); let src = _mm256_set1_epi16(-1); let r = _mm512_mask_cvtsepi32_epi16(src, 0, a); assert_eq_m256i(r, src); let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a); + #[rustfmt::skip] let e = _mm256_set_epi16( - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 8, - 9, - 10, - 11, - 12, - 13, - i16::MIN, - i16::MAX, + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, ); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtsepi32_epi16() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MAX, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); let r = _mm512_maskz_cvtsepi32_epi16(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a); + #[rustfmt::skip] let e = _mm256_set_epi16( - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 8, - 9, - 10, - 11, - 12, - 13, - i16::MIN, - i16::MAX, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i16::MIN, i16::MAX, ); assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let src = _mm_set1_epi16(-1); + let r = _mm256_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi32_epi16() { + let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_maskz_cvtsepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_cvtsepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtsepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = 
"avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi32_epi16() { + let a = _mm_set_epi32(4, 5, 6, 7); + let r = _mm_maskz_cvtsepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtsepi32_epi8() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MAX, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); let r = _mm512_cvtsepi32_epi8(a); + #[rustfmt::skip] let e = _mm_set_epi8( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i8::MIN, - i8::MAX, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, ); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtsepi32_epi8() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MAX, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); let src = _mm_set1_epi8(-1); let r = _mm512_mask_cvtsepi32_epi8(src, 0, a); assert_eq_m128i(r, src); let r = _mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a); + #[rustfmt::skip] let e = _mm_set_epi8( - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 8, - 9, - 10, - 11, - 12, - 13, - i8::MIN, - i8::MAX, + -1, -1, -1, -1, + -1, -1, -1, -1, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, ); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtsepi32_epi8() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MAX, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MAX, ); let r = _mm512_maskz_cvtsepi32_epi8(0, a); assert_eq_m128i(r, _mm_setzero_si128()); let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a); + #[rustfmt::skip] let e = _mm_set_epi8( - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 8, - 9, - 10, - 11, - 12, - 13, - i8::MIN, - i8::MAX, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 12, 13, i8::MIN, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi32_epi8(src, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi32_epi8() { + let a = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm256_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi32_epi8(0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 9, 10, 11, 12, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + 
unsafe fn test_mm_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_cvtsepi32_epi8(a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtsepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi32_epi8(src, 0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi32_epi8() { + let a = _mm_set_epi32(13, 14, 15, 16); + let r = _mm_maskz_cvtsepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi32_epi8(0b00001111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 14, 15, 16, ); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtusepi32_epi16() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MIN, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, ); let r = _mm512_cvtusepi32_epi16(a); let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); @@ -43736,23 +43716,12 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtusepi32_epi16() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MIN, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, ); let src = _mm256_set1_epi16(-1); let r = _mm512_mask_cvtusepi32_epi16(src, 0, a); @@ -43764,23 +43733,12 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtusepi32_epi16() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MIN, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, ); let r = _mm512_maskz_cvtusepi32_epi16(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); @@ -43791,23 +43749,12 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtusepi32_epi8() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MIN, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, ); let r = _mm512_cvtusepi32_epi8(a); let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1); @@ -43816,23 +43763,12 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtusepi32_epi8() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MIN, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, ); let src = _mm_set1_epi8(-1); let r = _mm512_mask_cvtusepi32_epi8(src, 0, a); @@ -43844,23 +43780,12 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtusepi32_epi8() { + #[rustfmt::skip] let a = _mm512_set_epi32( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - i32::MIN, - i32::MIN, + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, i32::MIN, i32::MIN, ); let r = 
_mm512_maskz_cvtusepi32_epi8(0, a); assert_eq_m128i(r, _mm_setzero_si128()); @@ -44014,23 +43939,12 @@ mod tests { unsafe fn test_mm512_cvt_roundepu32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); let r = _mm512_cvt_roundepu32_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + #[rustfmt::skip] let e = _mm512_setr_ps( - 0., - 4294967300., - 2., - 4294967300., - 4., - 4294967300., - 6., - 4294967300., - 8., - 10., - 10., - 12., - 12., - 14., - 14., - 16., + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 8., 10., 10., 12., + 12., 14., 14., 16., ); assert_eq_m512(r, e); } @@ -44048,23 +43962,12 @@ mod tests { a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - 0., - 4294967300., - 2., - 4294967300., - 4., - 4294967300., - 6., - 4294967300., - 0., - 0., - 0., - 0., - 0., - 0., - 0., - 0., + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., ); assert_eq_m512(r, e); } @@ -44079,23 +43982,12 @@ mod tests { a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); + #[rustfmt::skip] let e = _mm512_setr_ps( - 0., - 4294967300., - 2., - 4294967300., - 4., - 4294967300., - 6., - 4294967300., - 0., - 0., - 0., - 0., - 0., - 0., - 0., - 0., + 0., 4294967300., 2., 4294967300., + 4., 4294967300., 6., 4294967300., + 0., 0., 0., 0., + 0., 0., 0., 0., ); assert_eq_m512(r, e); } From 600103645cd19680b30c2750a5dd7932aea9fc57 Mon Sep 17 00:00:00 2001 From: jirong Date: Tue, 16 Feb 2021 17:23:02 -0500 Subject: [PATCH 06/31] cvtsepi32_mask_storeu_epi16,epi8: mm256,mm --- crates/core_arch/avx512f.md | 9 +- crates/core_arch/src/x86/avx512f.rs | 162 ++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 2347aa8189..5ac0c344cf 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2159,9 +2159,12 @@ * [x] [`_mm256_cvtsepi32_epi8`] * [x] [`_mm256_mask_cvtsepi32_epi8`] * [x] [`_mm256_maskz_cvtsepi32_epi8`] - * [ ] [`_mm512_mask_cvtsepi32_storeu_epi16`] - * [ ] [`_mm512_mask_cvtsepi32_storeu_epi8`] - + * [x] [`_mm512_mask_cvtsepi32_storeu_epi16`] + * [x] [`_mm_mask_cvtsepi32_storeu_epi16`] + * [x] [`_mm256_mask_cvtsepi32_storeu_epi16`] + * [x] [`_mm512_mask_cvtsepi32_storeu_epi8`] + * [x] [`_mm_mask_cvtsepi32_storeu_epi8`] + * [x] [`_mm256_mask_cvtsepi32_storeu_epi8`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 0f74b4a7b2..ac0f82a7bd 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -30910,6 +30910,36 @@ pub unsafe fn _mm_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: vpmovdwmem128(mem_addr as *mut i8, a.as_i32x4(), k); } +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_storeu_epi16&expand=1833) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovsdwmem(mem_addr as *mut i8, a.as_i32x16(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_storeu_epi16&expand=1832) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsdwmem256(mem_addr as *mut i8, a.as_i32x8(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_storeu_epi16&expand=1831) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdw))] +pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsdwmem128(mem_addr as *mut i8, a.as_i32x4(), k); +} + /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_storeu_epi8&expand=1463) @@ -30940,6 +30970,36 @@ pub unsafe fn _mm_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: _ vpmovdbmem128(mem_addr as *mut i8, a.as_i32x4(), k); } +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_storeu_epi8&expand=1836) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovsdbmem(mem_addr as *mut i8, a.as_i32x16(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_storeu_epi8&expand=1835) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsdbmem256(mem_addr as *mut i8, a.as_i32x8(), k); +} + +/// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_storeu_epi8&expand=1834) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsdb))] +pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsdbmem128(mem_addr as *mut i8, a.as_i32x4(), k); +} + /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi16&expand=1513) @@ -37804,6 +37864,13 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.128"] fn vpmovdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.512"] + fn vpmovsdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.256"] + fn vpmovsdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.128"] + fn vpmovsdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.512"] fn vpmovdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.256"] @@ -37811,6 +37878,13 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.128"] fn vpmovdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.512"] + fn vpmovsdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.256"] + fn vpmovsdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.128"] + fn vpmovsdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.512"] fn vpmovqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.256"] @@ -50542,6 +50616,33 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_storeu_epi16() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(i16::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_storeu_epi16() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set1_epi16(i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_storeu_epi16() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = 
_mm_set1_epi8(0); + _mm_mask_cvtsepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtepi32_storeu_epi8() { let a = _mm512_set1_epi32(9); @@ -50569,6 +50670,67 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi32_storeu_epi8() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi32_storeu_epi8() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8( + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi32_storeu_epi8() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8( + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + i8::MAX, + i8::MAX, + i8::MAX, + i8::MAX, + ); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_storeu_epi32() { let a = _mm512_set1_epi32(9); From 61a27154d9a00b1cbb9833c94c60738b59d55748 Mon Sep 17 00:00:00 2001 From: jirong Date: Tue, 16 Feb 2021 18:02:36 -0500 Subject: [PATCH 07/31] cvtsepi64_epi16,epi32: mm256,mm --- crates/core_arch/avx512f.md | 36 +++--- crates/core_arch/src/x86/avx512f.rs | 146 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 116 ++++++++++++++++++++ 3 files changed, 282 insertions(+), 16 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 5ac0c344cf..bebbafdaa9 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2165,11 +2165,28 @@ * [x] [`_mm512_mask_cvtsepi32_storeu_epi8`] * [x] [`_mm_mask_cvtsepi32_storeu_epi8`] * [x] [`_mm256_mask_cvtsepi32_storeu_epi8`] - - - + * [x] [`_mm512_cvtsepi64_epi16`] + * [x] [`_mm512_mask_cvtsepi64_epi16`] + * [x] [`_mm512_maskz_cvtsepi64_epi16`] + * [x] [`_mm_cvtsepi64_epi16`] + * [x] [`_mm_mask_cvtsepi64_epi16`] + * [x] [`_mm_maskz_cvtsepi64_epi16`] + * [x] [`_mm256_cvtsepi64_epi16`] + * [x] [`_mm256_mask_cvtsepi64_epi16`] + * [x] [`_mm256_maskz_cvtsepi64_epi16`] + * [x] [`_mm512_cvtsepi64_epi32`] * [x] [`_mm512_mask_cvtsepi64_epi32`] + * [x] [`_mm512_maskz_cvtsepi64_epi32`] + * [x] [`_mm_cvtsepi64_epi32`] + * [x] [`_mm_mask_cvtsepi64_epi32`] + * [x] [`_mm_maskz_cvtsepi64_epi32`] + * [x] [`_mm256_cvtsepi64_epi32`] + * [x] [`_mm256_mask_cvtsepi64_epi32`] + * [x] [`_mm256_maskz_cvtsepi64_epi32`] + * [x] [`_mm512_cvtsepi64_epi8`] * [x] [`_mm512_mask_cvtsepi64_epi8`] + * [x] [`_mm512_maskz_cvtsepi64_epi8`] + * [ ] [`_mm512_mask_cvtsepi64_storeu_epi16`] * [ ] [`_mm512_mask_cvtsepi64_storeu_epi32`] * [ ] [`_mm512_mask_cvtsepi64_storeu_epi8`] @@ -2182,12 +2199,6 @@ * [x] [`_mm512_cvt_roundps_epi32`] * [x] [`_mm512_cvt_roundps_epu32`] * [x] [`_mm512_cvt_roundps_pd`] - - * [x] [`_mm512_mask_cvtsepi64_epi16`] - - * [x] [`_mm512_cvtsepi64_epi16`] - * [x] [`_mm512_cvtsepi64_epi32`] - * 
[x] [`_mm512_cvtsepi64_epi8`] * [x] [`_mm512_cvtt_roundpd_epi32`] * [x] [`_mm512_cvtt_roundpd_epu32`] * [x] [`_mm512_cvtt_roundps_epi32`] @@ -2222,8 +2233,6 @@ * [x] [`_mm512_mask_cvt_roundps_epi32`] * [x] [`_mm512_mask_cvt_roundps_epu32`] * [x] [`_mm512_mask_cvt_roundps_pd`] - - * [x] [`_mm512_mask_cvtt_roundpd_epi32`] * [x] [`_mm512_mask_cvtt_roundpd_epu32`] * [x] [`_mm512_mask_cvtt_roundps_epi32`] @@ -2251,11 +2260,6 @@ * [x] [`_mm512_maskz_cvt_roundps_epi32`] * [x] [`_mm512_maskz_cvt_roundps_epu32`] * [x] [`_mm512_maskz_cvt_roundps_pd`] - - - * [x] [`_mm512_maskz_cvtsepi64_epi16`] - * [x] [`_mm512_maskz_cvtsepi64_epi32`] - * [x] [`_mm512_maskz_cvtsepi64_epi8`] * [x] [`_mm512_maskz_cvtt_roundpd_epi32`] * [x] [`_mm512_maskz_cvtt_roundpd_epu32`] * [x] [`_mm512_maskz_cvtt_roundps_epi32`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index ac0f82a7bd..a9f1a2cfe1 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13184,6 +13184,74 @@ pub unsafe fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { transmute(vpmovsqd(a.as_i64x8(), _mm256_setzero_si256().as_i32x8(), k)) } +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi32&expand=1849) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i { + transmute(vpmovsqd256( + a.as_i64x4(), + _mm_setzero_si128().as_i32x4(), + 0b11111111, + )) +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi32&expand=1850) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovsqd256(a.as_i64x4(), src.as_i32x4(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi32&expand=1851) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovsqd256(a.as_i64x4(), _mm_setzero_si128().as_i32x4(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi32&expand=1846) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i { + transmute(vpmovsqd128( + a.as_i64x2(), + _mm_setzero_si128().as_i32x4(), + 0b11111111, + )) +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi32&expand=1847) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovsqd128(a.as_i64x2(), src.as_i32x4(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi32&expand=1848) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovsqd128(a.as_i64x2(), _mm_setzero_si128().as_i32x4(), k)) +} + /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi16&expand=1843) @@ -13218,6 +13286,74 @@ pub unsafe fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { transmute(vpmovsqw(a.as_i64x8(), _mm_setzero_si128().as_i16x8(), k)) } +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi16&expand=1840) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i { + transmute(vpmovsqw256( + a.as_i64x4(), + _mm_setzero_si128().as_i16x8(), + 0b11111111, + )) +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi16&expand=1841) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovsqw256(a.as_i64x4(), src.as_i16x8(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi16&expand=1842) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovsqw256(a.as_i64x4(), _mm_setzero_si128().as_i16x8(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi16&expand=1837) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i { + transmute(vpmovsqw128( + a.as_i64x2(), + _mm_setzero_si128().as_i16x8(), + 0b11111111, + )) +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi16&expand=1838) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovsqw128(a.as_i64x2(), src.as_i16x8(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi16&expand=1839) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovsqw128(a.as_i64x2(), _mm_setzero_si128().as_i16x8(), k)) +} + /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi8&expand=1861) @@ -37923,8 +38059,18 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovs.qd.512"] fn vpmovsqd(a: i64x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.256"] + fn vpmovsqd256(a: i64x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.128"] + fn vpmovsqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.512"] fn vpmovsqw(a: i64x8, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.256"] + fn vpmovsqw256(a: i64x4, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.128"] + fn vpmovsqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.512"] fn vpmovsqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index c5fb635bfd..2dd0ed1b4d 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4036,6 +4036,64 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_cvtsepi64_epi32(a); + let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let src = _mm_set1_epi32(-1); + let r = _mm256_mask_cvtsepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi64_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_maskz_cvtsepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi64_epi32(0b00001111, a); + let e = _mm_set_epi32(4, 5, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi64_epi32() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_cvtsepi64_epi32(a); + let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_epi32() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtsepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi64_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi64_epi32() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_maskz_cvtsepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi64_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, i32::MIN, i32::MAX); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtsepi64_epi16() { let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); @@ -4065,6 +4123,64 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_cvtsepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_cvtsepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtsepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi64_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_maskz_cvtsepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi64_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi64_epi16() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_cvtsepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_epi16() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtsepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi64_epi16(src, 0b00000011, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi64_epi16() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_maskz_cvtsepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi64_epi16(0b00000011, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MIN, i16::MAX); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtsepi64_epi8() { let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MAX); From 4f331c80d4c5e39c0d19a2278cc745e27c911080 Mon Sep 17 00:00:00 2001 From: jirong Date: Tue, 16 Feb 2021 20:14:42 -0500 Subject: [PATCH 08/31] cvtsepi64_mask_storeu_epi8,epi16,epi32: mm256,mm --- crates/core_arch/avx512f.md | 18 ++- crates/core_arch/src/x86/avx512f.rs | 186 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 184 +++++++++++++++++++++--- 3 files changed, 363 insertions(+), 25 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index bebbafdaa9..6bec064472 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2186,10 +2186,22 @@ * [x] [`_mm512_cvtsepi64_epi8`] * [x] [`_mm512_mask_cvtsepi64_epi8`] * [x] [`_mm512_maskz_cvtsepi64_epi8`] + * [x] [`_mm_cvtsepi64_epi8`] + * [x] [`_mm_mask_cvtsepi64_epi8`] + * [x] [`_mm_maskz_cvtsepi64_epi8`] + * [x] [`_mm256_cvtsepi64_epi8`] + * [x] [`_mm256_mask_cvtsepi64_epi8`] + * [x] [`_mm256_maskz_cvtsepi64_epi8`] + * [x] [`_mm512_mask_cvtsepi64_storeu_epi16`] + * [x] [`_mm_mask_cvtsepi64_storeu_epi16`] + * [x] [`_mm256_mask_cvtsepi64_storeu_epi16`] + * [x] [`_mm512_mask_cvtsepi64_storeu_epi32`] + * [x] [`_mm_mask_cvtsepi64_storeu_epi32`] + * [x] [`_mm256_mask_cvtsepi64_storeu_epi32`] + * [x] [`_mm512_mask_cvtsepi64_storeu_epi8`] + * [x] [`_mm_mask_cvtsepi64_storeu_epi8`] + * [x] [`_mm256_mask_cvtsepi64_storeu_epi8`] - * [ ] 
[`_mm512_mask_cvtsepi64_storeu_epi16`] - * [ ] [`_mm512_mask_cvtsepi64_storeu_epi32`] - * [ ] [`_mm512_mask_cvtsepi64_storeu_epi8`] * [x] [`_mm512_cvt_roundepi32_ps`] * [x] [`_mm512_cvt_roundepu32_ps`] * [x] [`_mm512_cvt_roundpd_epi32`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a9f1a2cfe1..78fd13b426 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13388,6 +13388,74 @@ pub unsafe fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { transmute(vpmovsqb(a.as_i64x8(), _mm_setzero_si128().as_i8x16(), k)) } +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi8&expand=1858) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i { + transmute(vpmovsqb256( + a.as_i64x4(), + _mm_setzero_si128().as_i8x16(), + 0b11111111, + )) +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi8&expand=1859) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovsqb256(a.as_i64x4(), src.as_i8x16(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi8&expand=1860) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovsqb256(a.as_i64x4(), _mm_setzero_si128().as_i8x16(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi8&expand=1855) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i { + transmute(vpmovsqb128( + a.as_i64x2(), + _mm_setzero_si128().as_i8x16(), + 0b11111111, + )) +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi8&expand=1856) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovsqb128(a.as_i64x2(), src.as_i8x16(), k)) +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi8&expand=1857) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovsqb128(a.as_i64x2(), _mm_setzero_si128().as_i8x16(), k)) +} + /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi32_epi16&expand=2054) @@ -31166,6 +31234,36 @@ pub unsafe fn _mm_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: vpmovqwmem128(mem_addr as *mut i8, a.as_i64x2(), k); } +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi16&expand=1866) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovsqwmem(mem_addr as *mut i8, a.as_i64x8(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi16&expand=1865) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsqwmem256(mem_addr as *mut i8, a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi16&expand=1864) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqw))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsqwmem128(mem_addr as *mut i8, a.as_i64x2(), k); +} + /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi8&expand=1519) @@ -31196,6 +31294,36 @@ pub unsafe fn _mm_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: _ vpmovqbmem128(mem_addr as *mut i8, a.as_i64x2(), k); } +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi8&expand=1872) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovsqbmem(mem_addr as *mut i8, a.as_i64x8(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi8&expand=1871) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsqbmem256(mem_addr as *mut i8, a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi8&expand=1870) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqb))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsqbmem128(mem_addr as *mut i8, a.as_i64x2(), k); +} + ///Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi32&expand=1516) @@ -31226,6 +31354,36 @@ pub unsafe fn _mm_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: vpmovqdmem128(mem_addr as *mut i8, a.as_i64x2(), k); } +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi32&expand=1869) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovsqdmem(mem_addr as *mut i8, a.as_i64x8(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi32&expand=1868) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovsqdmem256(mem_addr as *mut i8, a.as_i64x4(), k); +} + +/// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi32&expand=1867) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovsqd))] +pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovsqdmem128(mem_addr as *mut i8, a.as_i64x2(), k); +} + /// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_epi32&expand=5628) @@ -38027,12 +38185,28 @@ extern "C" { fn vpmovqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.128"] fn vpmovqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.512"] + fn vpmovsqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.256"] + fn vpmovsqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.128"] + fn vpmovsqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.512"] fn vpmovqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.256"] fn vpmovqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.128"] fn vpmovqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.512"] + fn vpmovsqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.256"] + fn vpmovsqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.128"] + fn vpmovsqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.512"] fn vpmovqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.256"] @@ -38040,6 +38214,13 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.128"] fn vpmovqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.512"] + fn vpmovsqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.256"] + fn vpmovsqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.128"] + fn vpmovsqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qb.512"] fn vpmovqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; @@ -38073,6 +38254,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovs.qb.512"] fn vpmovsqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.256"] + fn vpmovsqb256(a: i64x4, src: i8x16, mask: u8) -> 
i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovs.qb.128"] + fn vpmovsqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"] fn vpmovusdw(a: u32x16, src: u16x16, mask: u16) -> u16x16; #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 2dd0ed1b4d..c54878444f 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4196,23 +4196,12 @@ mod tests { let r = _mm512_mask_cvtsepi64_epi8(src, 0, a); assert_eq_m128i(r, src); let r = _mm512_mask_cvtsepi64_epi8(src, 0b00001111, a); + #[rustfmt::skip] let e = _mm_set_epi8( - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - -1, - -1, - -1, - -1, - 4, - 5, - i8::MIN, - i8::MAX, + 0, 0, 0, 0, + 0, 0, 0, 0, + -1, -1, -1, -1, + 4, 5, i8::MIN, i8::MAX, ); assert_eq_m128i(r, e); } @@ -4227,6 +4216,64 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtsepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_cvtsepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtsepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtsepi64_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtsepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, i64::MIN, i64::MAX); + let r = _mm256_maskz_cvtsepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtsepi64_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtsepi64_epi8() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_cvtsepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_epi8() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtsepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtsepi64_epi8(src, 0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtsepi64_epi8() { + let a = _mm_set_epi64x(i64::MIN, i64::MAX); + let r = _mm_maskz_cvtsepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtsepi64_epi8(0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MIN, i8::MAX); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtusepi64_epi32() { let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); @@ -10869,7 +10916,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_cvtepi64_storeu_epi16() { - let a = _mm256_set1_epi32(9); + let a = _mm256_set1_epi64x(9); let mut r = _mm_set1_epi16(0); 
_mm256_mask_cvtepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); let e = _mm_set_epi16(0, 0, 0, 0, 9, 9, 9, 9); @@ -10878,13 +10925,40 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_cvtepi64_storeu_epi16() { - let a = _mm_set1_epi32(9); + let a = _mm_set1_epi64x(9); let mut r = _mm_set1_epi16(0); _mm_mask_cvtepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 9, 9); assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_storeu_epi16() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set1_epi16(i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_storeu_epi16() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm256_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, i16::MAX, i16::MAX, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_storeu_epi16() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtsepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, i16::MAX, i16::MAX); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtepi64_storeu_epi8() { let a = _mm512_set1_epi64(9); @@ -10896,7 +10970,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_cvtepi64_storeu_epi8() { - let a = _mm256_set1_epi32(9); + let a = _mm256_set1_epi64x(9); let mut r = _mm_set1_epi8(0); _mm256_mask_cvtepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9); @@ -10905,13 +10979,52 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_cvtepi64_storeu_epi8() { - let a = _mm_set1_epi32(9); + let a = _mm_set1_epi64x(9); let mut r = _mm_set1_epi8(0); _mm_mask_cvtepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9); assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_storeu_epi8() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm512_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_storeu_epi8() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_storeu_epi8() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtsepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, i8::MAX, i8::MAX); + 
assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtepi64_storeu_epi32() { let a = _mm512_set1_epi64(9); @@ -10923,7 +11036,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_cvtepi64_storeu_epi32() { - let a = _mm256_set1_epi32(9); + let a = _mm256_set1_epi64x(9); let mut r = _mm_set1_epi32(0); _mm256_mask_cvtepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a); let e = _mm_set_epi32(9, 9, 9, 9); @@ -10932,13 +11045,40 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_cvtepi64_storeu_epi32() { - let a = _mm_set1_epi32(9); + let a = _mm_set1_epi64x(9); let mut r = _mm_set1_epi16(0); _mm_mask_cvtepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a); let e = _mm_set_epi32(0, 0, 9, 9); assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtsepi64_storeu_epi32() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm256_set1_epi32(i32::MAX); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtsepi64_storeu_epi32() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi32(0); + _mm256_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b00001111, a); + let e = _mm_set1_epi32(i32::MAX); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtsepi64_storeu_epi32() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtsepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b00000011, a); + let e = _mm_set_epi32(0, 0, i32::MAX, i32::MAX); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_storeu_epi64() { let a = _mm512_set1_epi64(9); From 1acd7aa741f55d8ebc5b82aba18273d77e9878cd Mon Sep 17 00:00:00 2001 From: jirong Date: Tue, 16 Feb 2021 20:42:30 -0500 Subject: [PATCH 09/31] cvtusepi32_epi16: mm256,mm --- crates/core_arch/avx512f.md | 60 ++++++++---- crates/core_arch/src/x86/avx512f.rs | 139 ++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 20 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 6bec064472..58e29749ab 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2201,6 +2201,43 @@ * [x] [`_mm512_mask_cvtsepi64_storeu_epi8`] * [x] [`_mm_mask_cvtsepi64_storeu_epi8`] * [x] [`_mm256_mask_cvtsepi64_storeu_epi8`] + * [x] [`_mm512_cvtusepi32_epi16`] + * [x] [`_mm512_mask_cvtusepi32_epi16`] + * [x] [`_mm512_maskz_cvtusepi32_epi16`] + * [x] [`_mm_cvtusepi32_epi16`] + * [x] [`_mm_mask_cvtusepi32_epi16`] + * [x] [`_mm_maskz_cvtusepi32_epi16`] + * [x] [`_mm256_cvtusepi32_epi16`] + * [x] [`_mm256_mask_cvtusepi32_epi16`] + * [x] [`_mm256_maskz_cvtusepi32_epi16`] + * [x] [`_mm512_cvtusepi32_epi8`] + * [x] [`_mm512_mask_cvtusepi32_epi8`] + * [x] [`_mm512_maskz_cvtusepi32_epi8`] + + + + + * [x] [`_mm512_cvtusepi64_epi16`] + * [x] [`_mm512_mask_cvtusepi64_epi16`] + * [x] [`_mm512_maskz_cvtusepi64_epi16`] + + * [x] [`_mm512_cvtusepi64_epi32`] + * [x] [`_mm512_mask_cvtusepi64_epi32`] + * [x] [`_mm512_maskz_cvtusepi64_epi32`] + + * [x] [`_mm512_cvtusepi64_epi8`] + * [x] [`_mm512_mask_cvtusepi64_epi8`] + * [x] [`_mm512_maskz_cvtusepi64_epi8`] + + + * [ ] [`_mm512_mask_cvtusepi32_storeu_epi16`] + * [ ] [`_mm512_mask_cvtusepi32_storeu_epi8`] + + + + * [ ] 
[`_mm512_mask_cvtusepi64_storeu_epi16`] + * [ ] [`_mm512_mask_cvtusepi64_storeu_epi32`] + * [ ] [`_mm512_mask_cvtusepi64_storeu_epi8`] * [x] [`_mm512_cvt_roundepi32_ps`] * [x] [`_mm512_cvt_roundepu32_ps`] @@ -2219,11 +2256,7 @@ * [x] [`_mm512_cvttpd_epu32`] * [x] [`_mm512_cvttps_epi32`] * [x] [`_mm512_cvttps_epu32`] - * [x] [`_mm512_cvtusepi32_epi16`] - * [x] [`_mm512_cvtusepi32_epi8`] - * [x] [`_mm512_cvtusepi64_epi16`] - * [x] [`_mm512_cvtusepi64_epi32`] - * [x] [`_mm512_cvtusepi64_epi8`] + * [x] [`_mm512_int2mask`] * [x] [`_mm512_kand`] * [x] [`_mm512_kandn`] @@ -2253,16 +2286,7 @@ * [x] [`_mm512_mask_cvttpd_epu32`] * [x] [`_mm512_mask_cvttps_epi32`] * [x] [`_mm512_mask_cvttps_epu32`] - * [x] [`_mm512_mask_cvtusepi32_epi16`] - * [x] [`_mm512_mask_cvtusepi32_epi8`] - * [ ] [`_mm512_mask_cvtusepi32_storeu_epi16`] - * [ ] [`_mm512_mask_cvtusepi32_storeu_epi8`] - * [x] [`_mm512_mask_cvtusepi64_epi16`] - * [x] [`_mm512_mask_cvtusepi64_epi32`] - * [x] [`_mm512_mask_cvtusepi64_epi8`] - * [ ] [`_mm512_mask_cvtusepi64_storeu_epi16`] - * [ ] [`_mm512_mask_cvtusepi64_storeu_epi32`] - * [ ] [`_mm512_mask_cvtusepi64_storeu_epi8`] + * [x] [`_mm512_maskz_cvt_roundepi32_ps`] * [x] [`_mm512_maskz_cvt_roundepu32_ps`] * [x] [`_mm512_maskz_cvt_roundpd_epi32`] @@ -2280,11 +2304,7 @@ * [x] [`_mm512_maskz_cvttpd_epu32`] * [x] [`_mm512_maskz_cvttps_epi32`] * [x] [`_mm512_maskz_cvttps_epu32`] - * [x] [`_mm512_maskz_cvtusepi32_epi16`] - * [x] [`_mm512_maskz_cvtusepi32_epi8`] - * [x] [`_mm512_maskz_cvtusepi64_epi16`] - * [x] [`_mm512_maskz_cvtusepi64_epi32`] - * [x] [`_mm512_maskz_cvtusepi64_epi8`] + * [x] [`_mm_add_round_sd`] * [x] [`_mm_add_round_ss`] * [x] [`_mm_cmp_round_sd_mask`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 78fd13b426..faacc9fb14 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13494,6 +13494,82 @@ pub unsafe fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i )) } +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi32_epi16&expand=2051) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i { + transmute(vpmovusdw256( + a.as_u32x8(), + _mm_setzero_si128().as_u16x8(), + 0b11111111, + )) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_epi16&expand=2052) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusdw256(a.as_u32x8(), src.as_u16x8(), k)) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi32_epi16&expand=2053) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusdw256( + a.as_u32x8(), + _mm_setzero_si128().as_u16x8(), + k, + )) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi32_epi16&expand=2048) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i { + transmute(vpmovusdw128( + a.as_u32x4(), + _mm_setzero_si128().as_u16x8(), + 0b11111111, + )) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_epi16&expand=2049) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusdw128(a.as_u32x4(), src.as_u16x8(), k)) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi32_epi16&expand=2050) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusdw128( + a.as_u32x4(), + _mm_setzero_si128().as_u16x8(), + k, + )) +} + /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi32_epi8&expand=2063) @@ -38261,6 +38337,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"] fn vpmovusdw(a: u32x16, src: u16x16, mask: u16) -> u16x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.256"] + fn vpmovusdw256(a: u32x8, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.128"] + fn vpmovusdw128(a: u32x4, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"] fn vpmovusdb(a: u32x16, src: u8x16, mask: u16) -> u8x16; #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"] @@ -44153,6 +44234,64 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtusepi32_epi16(a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtusepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi32_epi16(src, 0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi32_epi16() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_maskz_cvtusepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi32_epi16(0b11111111, a); + let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_cvtusepi32_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtusepi32_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi32_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi32_epi16() { + let a = _mm_set_epi32(5, 6, 7, 8); + let r = _mm_maskz_cvtusepi32_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi32_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtusepi32_epi8() { #[rustfmt::skip] From 1cc0120d856f6f2d508c0eb2b52b2c403d1a0bee Mon Sep 17 00:00:00 2001 From: jirong Date: Wed, 17 Feb 2021 09:04:56 -0500 Subject: [PATCH 10/31] cvtusepi32_epi8: mm256,mm --- crates/core_arch/avx512f.md | 17 +++- crates/core_arch/src/x86/avx512f.rs | 139 ++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 58e29749ab..ace00dab1b 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2213,8 +2213,18 @@ * [x] [`_mm512_cvtusepi32_epi8`] * [x] [`_mm512_mask_cvtusepi32_epi8`] * [x] [`_mm512_maskz_cvtusepi32_epi8`] - - + * [x] [`_mm_cvtusepi32_epi8`] + * [x] [`_mm_mask_cvtusepi32_epi8`] + * [x] 
[`_mm_maskz_cvtusepi32_epi8`] + * [x] [`_mm256_cvtusepi32_epi8`] + * [x] [`_mm256_mask_cvtusepi32_epi8`] + * [x] [`_mm256_maskz_cvtusepi32_epi8`] + * [ ] [`_mm512_mask_cvtusepi32_storeu_epi16`] + * [_] [`_mm_mask_cvtusepi32_storeu_epi16`] + * [_] [`_mm256_mask_cvtusepi32_storeu_epi16`] + * [ ] [`_mm512_mask_cvtusepi32_storeu_epi8`] + * [_] [`_mm_mask_cvtusepi32_storeu_epi8`] + * [_] [`_mm256_mask_cvtusepi32_storeu_epi8`] * [x] [`_mm512_cvtusepi64_epi16`] @@ -2230,8 +2240,7 @@ * [x] [`_mm512_maskz_cvtusepi64_epi8`] - * [ ] [`_mm512_mask_cvtusepi32_storeu_epi16`] - * [ ] [`_mm512_mask_cvtusepi32_storeu_epi8`] + diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index faacc9fb14..e5915e061e 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13604,6 +13604,82 @@ pub unsafe fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i transmute(vpmovusdb(a.as_u32x16(), _mm_setzero_si128().as_u8x16(), k)) } +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi32_epi8&expand=2060) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i { + transmute(vpmovusdb256( + a.as_u32x8(), + _mm_setzero_si128().as_u8x16(), + 0b11111111, + )) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_epi8&expand=2061) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusdb256(a.as_u32x8(), src.as_u8x16(), k)) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi32_epi8&expand=2062) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusdb256( + a.as_u32x8(), + _mm_setzero_si128().as_u8x16(), + k, + )) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi32_epi8&expand=2057) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i { + transmute(vpmovusdb128( + a.as_u32x4(), + _mm_setzero_si128().as_u8x16(), + 0b11111111, + )) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_epi8&expand=2058) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusdb128(a.as_u32x4(), src.as_u8x16(), k)) +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi32_epi8&expand=2059) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusdb128( + a.as_u32x4(), + _mm_setzero_si128().as_u8x16(), + k, + )) +} + /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi32&expand=2087) @@ -38344,6 +38420,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"] fn vpmovusdb(a: u32x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.db.256"] + fn vpmovusdb256(a: u32x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.db.128"] + fn vpmovusdb128(a: u32x4, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"] fn vpmovusqd(a: u64x8, src: u32x8, mask: u8) -> u32x8; #[link_name = "llvm.x86.avx512.mask.pmovus.qw.512"] @@ -44339,6 +44420,64 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi32_epi8(src, 0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi32_epi8() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX); + let r = _mm256_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi32_epi8(0b11111111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_cvtusepi32_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtusepi32_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi32_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi32_epi8() { + let a = _mm_set_epi32(5, 6, 7, i32::MAX); + let r = _mm_maskz_cvtusepi32_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi32_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundps_epi32() { let a = _mm512_setr_ps( From f7269a82e3774364027f7d63860d540eb0f280c7 Mon Sep 17 00:00:00 2001 From: jirong Date: Wed, 17 Feb 2021 12:59:44 -0500 Subject: [PATCH 11/31] cvtusepi32_mask_storeu_epi8,epi16: mm256, mm --- crates/core_arch/avx512f.md | 14 +- crates/core_arch/src/x86/avx512f.rs | 191 +++++++++++++++++++++++----- 2 files changed, 165 insertions(+), 40 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index ace00dab1b..cab55111d6 100644 --- a/crates/core_arch/avx512f.md +++ 
b/crates/core_arch/avx512f.md @@ -2219,14 +2219,12 @@ * [x] [`_mm256_cvtusepi32_epi8`] * [x] [`_mm256_mask_cvtusepi32_epi8`] * [x] [`_mm256_maskz_cvtusepi32_epi8`] - * [ ] [`_mm512_mask_cvtusepi32_storeu_epi16`] - * [_] [`_mm_mask_cvtusepi32_storeu_epi16`] - * [_] [`_mm256_mask_cvtusepi32_storeu_epi16`] - * [ ] [`_mm512_mask_cvtusepi32_storeu_epi8`] - * [_] [`_mm_mask_cvtusepi32_storeu_epi8`] - * [_] [`_mm256_mask_cvtusepi32_storeu_epi8`] - - + * [x] [`_mm512_mask_cvtusepi32_storeu_epi16`] + * [x] [`_mm_mask_cvtusepi32_storeu_epi16`] + * [x] [`_mm256_mask_cvtusepi32_storeu_epi16`] + * [x] [`_mm512_mask_cvtusepi32_storeu_epi8`] + * [x] [`_mm_mask_cvtusepi32_storeu_epi8`] + * [x] [`_mm256_mask_cvtusepi32_storeu_epi8`] * [x] [`_mm512_cvtusepi64_epi16`] * [x] [`_mm512_mask_cvtusepi64_epi16`] * [x] [`_mm512_maskz_cvtusepi64_epi16`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e5915e061e..ce3c75307a 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -31296,6 +31296,36 @@ pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: vpmovsdwmem128(mem_addr as *mut i8, a.as_i32x4(), k); } +/// Convert packed unsigned 32-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_storeu_epi16&expand=2068) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovusdwmem(mem_addr as *mut i8, a.as_i32x16(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_storeu_epi16&expand=2067) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusdwmem256(mem_addr as *mut i8, a.as_i32x8(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_storeu_epi16&expand=2066) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdw))] +pub unsafe fn _mm_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovusdwmem128(mem_addr as *mut i8, a.as_i32x4(), k); +} + /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_storeu_epi8&expand=1463) @@ -31356,6 +31386,36 @@ pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: vpmovsdbmem128(mem_addr as *mut i8, a.as_i32x4(), k); } +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_storeu_epi8&expand=2071) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) { + vpmovusdbmem(mem_addr as *mut i8, a.as_i32x16(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_storeu_epi8&expand=2070) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusdbmem256(mem_addr as *mut i8, a.as_i32x8(), k); +} + +/// Convert packed unsigned 32-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_storeu_epi8&expand=2069) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusdb))] +pub unsafe fn _mm_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovusdbmem128(mem_addr as *mut i8, a.as_i32x4(), k); +} + /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi16&expand=1513) @@ -38317,6 +38377,13 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.128"] fn vpmovsdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.512"] + fn vpmovusdwmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.256"] + fn vpmovusdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.128"] + fn vpmovusdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.512"] fn vpmovdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.256"] @@ -38331,6 +38398,13 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.128"] fn vpmovsdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.512"] + fn vpmovusdbmem(mem_addr: *mut i8, a: i32x16, mask: u16); + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.256"] + fn vpmovusdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.128"] + fn vpmovusdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.512"] fn vpmovqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.256"] @@ -51253,6 +51327,42 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_storeu_epi16() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm256_set1_epi16(u16::MAX as i16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_storeu_epi16() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm256_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set1_epi16(u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_storeu_epi16() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi32_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi16( + 0, + 0, + 0, + 0, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + ); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtepi32_storeu_epi8() { let a = _mm512_set1_epi32(9); @@ -51294,23 +51404,12 @@ mod tests { let a = _mm256_set1_epi32(i32::MAX); let mut r = _mm_set1_epi8(0); _mm256_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] let e = _mm_set_epi8( - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - i8::MAX, - i8::MAX, - i8::MAX, - i8::MAX, - i8::MAX, - i8::MAX, - i8::MAX, - i8::MAX, + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, ); assert_eq_m128i(r, e); } @@ -51320,23 +51419,51 @@ mod tests { let a = _mm_set1_epi32(i32::MAX); let mut r = _mm_set1_epi8(0); _mm_mask_cvtsepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] let e = _mm_set_epi8( - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - i8::MAX, - i8::MAX, - i8::MAX, - i8::MAX, + 0, 
0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + i8::MAX, i8::MAX, i8::MAX, i8::MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi32_storeu_epi8() { + let a = _mm512_set1_epi32(i32::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a); + let e = _mm_set1_epi8(u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi32_storeu_epi8() { + let a = _mm256_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi32_storeu_epi8() { + let a = _mm_set1_epi32(i32::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi32_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, ); assert_eq_m128i(r, e); } From 5fd20117a74138667a8f2dc06e9fa6238a806248 Mon Sep 17 00:00:00 2001 From: jirong Date: Wed, 17 Feb 2021 14:12:09 -0500 Subject: [PATCH 12/31] cvtusepi64_epi8,epi16,epi32: mm256,mm --- crates/core_arch/avx512f.md | 32 +++- crates/core_arch/src/x86/avx512f.rs | 242 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 174 ++++++++++++++++++ 3 files changed, 440 insertions(+), 8 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index cab55111d6..fdee3b5518 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2228,23 +2228,39 @@ * [x] [`_mm512_cvtusepi64_epi16`] * [x] [`_mm512_mask_cvtusepi64_epi16`] * [x] [`_mm512_maskz_cvtusepi64_epi16`] - + * [x] [`_mm_cvtusepi64_epi16`] + * [x] [`_mm_mask_cvtusepi64_epi16`] + * [x] [`_mm_maskz_cvtusepi64_epi16`] + * [x] [`_mm256_cvtusepi64_epi16`] + * [x] [`_mm256_mask_cvtusepi64_epi16`] + * [x] [`_mm256_maskz_cvtusepi64_epi16`] * [x] [`_mm512_cvtusepi64_epi32`] * [x] [`_mm512_mask_cvtusepi64_epi32`] * [x] [`_mm512_maskz_cvtusepi64_epi32`] - + * [x] [`_mm_cvtusepi64_epi32`] + * [x] [`_mm_mask_cvtusepi64_epi32`] + * [x] [`_mm_maskz_cvtusepi64_epi32`] + * [x] [`_mm256_cvtusepi64_epi32`] + * [x] [`_mm256_mask_cvtusepi64_epi32`] + * [x] [`_mm256_maskz_cvtusepi64_epi32`] * [x] [`_mm512_cvtusepi64_epi8`] * [x] [`_mm512_mask_cvtusepi64_epi8`] * [x] [`_mm512_maskz_cvtusepi64_epi8`] - - - - - - + * [x] [`_mm_cvtusepi64_epi8`] + * [x] [`_mm_mask_cvtusepi64_epi8`] + * [x] [`_mm_maskz_cvtusepi64_epi8`] + * [x] [`_mm256_cvtusepi64_epi8`] + * [x] [`_mm256_mask_cvtusepi64_epi8`] + * [x] [`_mm256_maskz_cvtusepi64_epi8`] * [ ] [`_mm512_mask_cvtusepi64_storeu_epi16`] + * [_] [`_mm_mask_cvtusepi64_storeu_epi16`] + * [_] [`_mm256_mask_cvtusepi64_storeu_epi16`] * [ ] [`_mm512_mask_cvtusepi64_storeu_epi32`] + * [_] [`_mm_mask_cvtusepi64_storeu_epi32`] + * [_] [`_mm256_mask_cvtusepi64_storeu_epi32`] * [ ] [`_mm512_mask_cvtusepi64_storeu_epi8`] + * [_] [`_mm_mask_cvtusepi64_storeu_epi8`] + * [_] [`_mm256_mask_cvtusepi64_storeu_epi8`] * [x] [`_mm512_cvt_roundepi32_ps`] * [x] [`_mm512_cvt_roundepu32_ps`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs 
index ce3c75307a..31a85f5e64 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13718,6 +13718,82 @@ pub unsafe fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i )) } +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi32&expand=2084) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i { + transmute(vpmovusqd256( + a.as_u64x4(), + _mm_setzero_si128().as_u32x4(), + 0b11111111, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi32&expand=2085) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusqd256(a.as_u64x4(), src.as_u32x4(), k)) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi32&expand=2086) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusqd256( + a.as_u64x4(), + _mm_setzero_si128().as_u32x4(), + k, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi32&expand=2081) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i { + transmute(vpmovusqd128( + a.as_u64x2(), + _mm_setzero_si128().as_u32x4(), + 0b11111111, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi32&expand=2082) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusqd128(a.as_u64x2(), src.as_u32x4(), k)) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi32&expand=2083) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusqd128( + a.as_u64x2(), + _mm_setzero_si128().as_u32x4(), + k, + )) +} + /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi16&expand=2078) @@ -13752,6 +13828,82 @@ pub unsafe fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i transmute(vpmovusqw(a.as_u64x8(), _mm_setzero_si128().as_u16x8(), k)) } +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi16&expand=2075) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i { + transmute(vpmovusqw256( + a.as_u64x4(), + _mm_setzero_si128().as_u16x8(), + 0b11111111, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi16&expand=2076) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusqw256(a.as_u64x4(), src.as_u16x8(), k)) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi16&expand=2077) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusqw256( + a.as_u64x4(), + _mm_setzero_si128().as_u16x8(), + k, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi16&expand=2072) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i { + transmute(vpmovusqw128( + a.as_u64x2(), + _mm_setzero_si128().as_u16x8(), + 0b11111111, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi16&expand=2073) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusqw128(a.as_u64x2(), src.as_u16x8(), k)) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi16&expand=2074) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusqw128( + a.as_u64x2(), + _mm_setzero_si128().as_u16x8(), + k, + )) +} + /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi8&expand=2096) @@ -13786,6 +13938,82 @@ pub unsafe fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { transmute(vpmovusqb(a.as_u64x8(), _mm_setzero_si128().as_u8x16(), k)) } +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi8&expand=2093) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i { + transmute(vpmovusqb256( + a.as_u64x4(), + _mm_setzero_si128().as_u8x16(), + 0b11111111, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi8&expand=2094) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusqb256(a.as_u64x4(), src.as_u8x16(), k)) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi8&expand=2095) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + transmute(vpmovusqb256( + a.as_u64x4(), + _mm_setzero_si128().as_u8x16(), + k, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi8&expand=2090) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i { + transmute(vpmovusqb128( + a.as_u64x2(), + _mm_setzero_si128().as_u8x16(), + 0b11111111, + )) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi8&expand=2091) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusqb128(a.as_u64x2(), src.as_u8x16(), k)) +} + +/// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi8&expand=2092) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpmovusqb128( + a.as_u64x2(), + _mm_setzero_si128().as_u8x16(), + k, + )) +} + /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -38501,10 +38729,24 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"] fn vpmovusqd(a: u64x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.256"] + fn vpmovusqd256(a: u64x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.128"] + fn vpmovusqd128(a: u64x2, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.512"] fn vpmovusqw(a: u64x8, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.256"] + fn vpmovusqw256(a: u64x4, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.128"] + fn vpmovusqw128(a: u64x2, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.512"] fn vpmovusqb(a: u64x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.256"] + fn vpmovusqb256(a: u64x4, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.128"] + fn vpmovusqb128(a: u64x2, src: u8x16, mask: u8) -> u8x16; #[link_name = "llvm.x86.avx512.gather.dpd.512"] fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index c54878444f..7f2aa53360 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4303,6 +4303,64 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_cvtusepi64_epi32(a); + let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvtusepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi64_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi64_epi32() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_maskz_cvtusepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi64_epi32(0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi64_epi32() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_cvtusepi64_epi32(a); + let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_epi32() { + let a = _mm_set_epi64x(6, i64::MAX); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvtusepi64_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi64_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi64_epi32() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_maskz_cvtusepi64_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi64_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, u32::MAX as i32); + assert_eq_m128i(r, e); + } + 
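// Editorial aside, not part of the patch series: the saturation tests above
// only feed i64::MAX into the new vpmovusqd-based intrinsics. A minimal
// sketch of the same unsigned-saturation behaviour for a value just past the
// u32 range, assuming the intrinsics and assert helpers used in this module
// are in scope; the function name is hypothetical.
unsafe fn example_cvtusepi64_epi32_saturates() {
    // 0x1_0000_0000 does not fit in a u32, so unsigned saturation clamps it
    // to u32::MAX instead of truncating it to 0.
    let a = _mm256_set_epi64x(1, (u32::MAX as i64) + 1, 6, i64::MAX);
    let r = _mm256_cvtusepi64_epi32(a);
    let e = _mm_set_epi32(1, u32::MAX as i32, 6, u32::MAX as i32);
    assert_eq_m128i(r, e);
}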
#[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtusepi64_epi16() { let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); @@ -4332,6 +4390,64 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_cvtusepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let src = _mm_set1_epi16(0); + let r = _mm256_mask_cvtusepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi64_epi16(src, 0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi64_epi16() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_maskz_cvtusepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi64_epi16(0b00001111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi64_epi16() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_cvtusepi64_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_epi16() { + let a = _mm_set_epi64x(6, i64::MAX); + let src = _mm_set1_epi16(0); + let r = _mm_mask_cvtusepi64_epi16(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi64_epi16(src, 0b00000011, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi64_epi16() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_maskz_cvtusepi64_epi16(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi64_epi16(0b00000011, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 6, u16::MAX as i16); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtusepi64_epi8() { let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, i64::MIN, i64::MIN); @@ -4361,6 +4477,64 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvtusepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_cvtusepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let src = _mm_set1_epi8(0); + let r = _mm256_mask_cvtusepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvtusepi64_epi8(src, 0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvtusepi64_epi8() { + let a = _mm256_set_epi64x(4, 5, 6, i64::MAX); + let r = _mm256_maskz_cvtusepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvtusepi64_epi8(0b00001111, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, u8::MAX as i8); + assert_eq_m128i(r, 
e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvtusepi64_epi8() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_cvtusepi64_epi8(a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_epi8() { + let a = _mm_set_epi64x(6, i64::MAX); + let src = _mm_set1_epi8(0); + let r = _mm_mask_cvtusepi64_epi8(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvtusepi64_epi8(src, 0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvtusepi64_epi8() { + let a = _mm_set_epi64x(6, i64::MAX); + let r = _mm_maskz_cvtusepi64_epi8(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvtusepi64_epi8(0b00000011, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, u8::MAX as i8); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); From 2e5c4937249e18981fd94e20c738c8d7ea7802a9 Mon Sep 17 00:00:00 2001 From: jirong Date: Wed, 17 Feb 2021 15:50:12 -0500 Subject: [PATCH 13/31] cvtusepi64_mask_storeu_epi8,epi16,epi32: mm256,mm --- crates/core_arch/avx512f.md | 18 ++-- crates/core_arch/src/x86/avx512f.rs | 111 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 108 ++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index fdee3b5518..7239d55263 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2252,15 +2252,15 @@ * [x] [`_mm256_cvtusepi64_epi8`] * [x] [`_mm256_mask_cvtusepi64_epi8`] * [x] [`_mm256_maskz_cvtusepi64_epi8`] - * [ ] [`_mm512_mask_cvtusepi64_storeu_epi16`] - * [_] [`_mm_mask_cvtusepi64_storeu_epi16`] - * [_] [`_mm256_mask_cvtusepi64_storeu_epi16`] - * [ ] [`_mm512_mask_cvtusepi64_storeu_epi32`] - * [_] [`_mm_mask_cvtusepi64_storeu_epi32`] - * [_] [`_mm256_mask_cvtusepi64_storeu_epi32`] - * [ ] [`_mm512_mask_cvtusepi64_storeu_epi8`] - * [_] [`_mm_mask_cvtusepi64_storeu_epi8`] - * [_] [`_mm256_mask_cvtusepi64_storeu_epi8`] + * [x] [`_mm512_mask_cvtusepi64_storeu_epi16`] + * [x] [`_mm_mask_cvtusepi64_storeu_epi16`] + * [x] [`_mm256_mask_cvtusepi64_storeu_epi16`] + * [x] [`_mm512_mask_cvtusepi64_storeu_epi32`] + * [x] [`_mm_mask_cvtusepi64_storeu_epi32`] + * [x] [`_mm256_mask_cvtusepi64_storeu_epi32`] + * [x] [`_mm512_mask_cvtusepi64_storeu_epi8`] + * [x] [`_mm_mask_cvtusepi64_storeu_epi8`] + * [x] [`_mm256_mask_cvtusepi64_storeu_epi8`] * [x] [`_mm512_cvt_roundepi32_ps`] * [x] [`_mm512_cvt_roundepu32_ps`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 31a85f5e64..f4e85b0687 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -31704,6 +31704,36 @@ pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: vpmovsqwmem128(mem_addr as *mut i8, a.as_i64x2(), k); } +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi16&expand=2101) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovusqwmem(mem_addr as *mut i8, a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi16&expand=2100) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusqwmem256(mem_addr as *mut i8, a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi16&expand=2099) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqw))] +pub unsafe fn _mm_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovusqwmem128(mem_addr as *mut i8, a.as_i64x2(), k); +} + /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi8&expand=1519) @@ -31764,6 +31794,36 @@ pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: vpmovsqbmem128(mem_addr as *mut i8, a.as_i64x2(), k); } +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi8&expand=2107) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovusqbmem(mem_addr as *mut i8, a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi8&expand=2106) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusqbmem256(mem_addr as *mut i8, a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi8&expand=2105) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqb))] +pub unsafe fn _mm_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovusqbmem128(mem_addr as *mut i8, a.as_i64x2(), k); +} + ///Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi32&expand=1516) @@ -31824,6 +31884,36 @@ pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: vpmovsqdmem128(mem_addr as *mut i8, a.as_i64x2(), k); } +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi32&expand=2104) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) { + vpmovusqdmem(mem_addr as *mut i8, a.as_i64x8(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi32&expand=2103) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) { + vpmovusqdmem256(mem_addr as *mut i8, a.as_i64x4(), k); +} + +/// Convert packed unsigned 64-bit integers in a to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi32&expand=2102) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpmovusqd))] +pub unsafe fn _mm_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) { + vpmovusqdmem128(mem_addr as *mut i8, a.as_i64x2(), k); +} + /// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_epi32&expand=5628) @@ -38647,6 +38737,13 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.128"] fn vpmovsqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.512"] + fn vpmovusqwmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.256"] + fn vpmovusqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.128"] + fn vpmovusqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.512"] fn vpmovqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.256"] @@ -38661,6 +38758,13 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.128"] fn vpmovsqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.512"] + fn vpmovusqbmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.256"] + fn vpmovusqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.128"] + fn vpmovusqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.512"] fn vpmovqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.256"] @@ -38675,6 +38779,13 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.128"] fn vpmovsqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.512"] + fn vpmovusqdmem(mem_addr: *mut i8, a: i64x8, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.256"] + fn vpmovusqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.128"] + fn vpmovusqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8); + #[link_name = "llvm.x86.avx512.mask.pmov.qb.512"] fn vpmovqb(a: i64x8, src: i8x16, mask: u8) -> i8x16; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 7f2aa53360..7a7408510e 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -11133,6 +11133,42 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_storeu_epi16() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm_undefined_si128(); + _mm512_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set1_epi16(u16::MAX as i16); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_storeu_epi16() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm256_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi16( + 0, + 0, + 0, + 0, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + u16::MAX as i16, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_storeu_epi16() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtusepi64_storeu_epi16(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, u16::MAX as i16, u16::MAX as i16); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtepi64_storeu_epi8() { let a = 
_mm512_set1_epi64(9); @@ -11199,6 +11235,51 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_storeu_epi8() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm512_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_storeu_epi8() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm256_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_storeu_epi8() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi8(0); + _mm_mask_cvtusepi64_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a); + #[rustfmt::skip] + let e = _mm_set_epi8( + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, u8::MAX as i8, u8::MAX as i8, + ); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cvtepi64_storeu_epi32() { let a = _mm512_set1_epi64(9); @@ -11253,6 +11334,33 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtusepi64_storeu_epi32() { + let a = _mm512_set1_epi64(i64::MAX); + let mut r = _mm256_undefined_si256(); + _mm512_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b11111111, a); + let e = _mm256_set1_epi32(u32::MAX as i32); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvtusepi64_storeu_epi32() { + let a = _mm256_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi32(0); + _mm256_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b00001111, a); + let e = _mm_set1_epi32(u32::MAX as i32); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvtusepi64_storeu_epi32() { + let a = _mm_set1_epi64x(i64::MAX); + let mut r = _mm_set1_epi16(0); + _mm_mask_cvtusepi64_storeu_epi32(&mut r as *mut _ as *mut i8, 0b00000011, a); + let e = _mm_set_epi32(0, 0, u32::MAX as i32, u32::MAX as i32); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_storeu_epi64() { let a = _mm512_set1_epi64(9); From 4cd987abf6d29b789d2af9565e4a1650bdea6094 Mon Sep 17 00:00:00 2001 From: jirong Date: Wed, 17 Feb 2021 16:05:31 -0500 Subject: [PATCH 14/31] cvtsi512_si32 --- crates/core_arch/src/x86/avx512f.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index f4e85b0687..cad1d53985 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -25469,6 +25469,17 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { transmute(a) } +/// Copy the lower 32-bit integer in a to dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsi512_si32&expand=1882) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovd))] +pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { + let extract: i32 = simd_extract(a.as_i32x16(), 0); + transmute(extract) +} + /// Broadcast the low packed 32-bit integer from a to all elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=545) @@ -55053,4 +55064,12 @@ mod tests { let e: i32 = 0; assert_eq!(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtsi512_si32() { + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r = _mm512_cvtsi512_si32(a); + let e: i32 = 1; + assert_eq!(r, e); + } } From 8a5874013fee7899a5592aa724b3422cac17d360 Mon Sep 17 00:00:00 2001 From: jirong Date: Wed, 17 Feb 2021 16:06:42 -0500 Subject: [PATCH 15/31] update avx512f.md --- crates/core_arch/avx512f.md | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 7239d55263..302ea2a1b6 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2261,6 +2261,7 @@ * [x] [`_mm512_mask_cvtusepi64_storeu_epi8`] * [x] [`_mm_mask_cvtusepi64_storeu_epi8`] * [x] [`_mm256_mask_cvtusepi64_storeu_epi8`] + * [x] ['_mm512_cvtsi512_si32'] * [x] [`_mm512_cvt_roundepi32_ps`] * [x] [`_mm512_cvt_roundepu32_ps`] From f014dfcb2048b0d90e75841d04cec6d27874e733 Mon Sep 17 00:00:00 2001 From: jirong Date: Wed, 17 Feb 2021 16:17:34 -0500 Subject: [PATCH 16/31] remove msvc check on mm512_cvtsi512_si32 --- crates/core_arch/src/x86/avx512f.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index cad1d53985..7bf011cab3 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -25474,7 +25474,10 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsi512_si32&expand=1882) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovd))] +#[cfg_attr( + all(test, not(target_os = "windows")), + assert_instr(vmovd) +)] pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { let extract: i32 = simd_extract(a.as_i32x16(), 0); transmute(extract) From d1e1570be4279cb1f21a6d45f68849e6f5fec48d Mon Sep 17 00:00:00 2001 From: jirong Date: Wed, 17 Feb 2021 16:22:15 -0500 Subject: [PATCH 17/31] cargo fmt --- crates/core_arch/src/x86/avx512f.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7bf011cab3..9ec81d922a 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -25474,10 +25474,7 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsi512_si32&expand=1882) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr( - all(test, not(target_os = "windows")), - assert_instr(vmovd) -)] +#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))] pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { let extract: i32 = simd_extract(a.as_i32x16(), 0); transmute(extract) From 
ada076e3f31a327160c04dde28f722953b1c3512 Mon Sep 17 00:00:00 2001 From: jirong Date: Thu, 18 Feb 2021 11:12:32 -0500 Subject: [PATCH 18/31] cvttpd_epi32,epu32: mm256,mm --- crates/core_arch/avx512f.md | 104 ++++++++++--------- crates/core_arch/src/x86/avx512f.rs | 134 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 100 ++++++++++++++++++ 3 files changed, 291 insertions(+), 47 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 302ea2a1b6..4aa261d395 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2262,72 +2262,69 @@ * [x] [`_mm_mask_cvtusepi64_storeu_epi8`] * [x] [`_mm256_mask_cvtusepi64_storeu_epi8`] * [x] ['_mm512_cvtsi512_si32'] - - * [x] [`_mm512_cvt_roundepi32_ps`] - * [x] [`_mm512_cvt_roundepu32_ps`] - * [x] [`_mm512_cvt_roundpd_epi32`] - * [x] [`_mm512_cvt_roundpd_epu32`] - * [x] [`_mm512_cvt_roundpd_ps`] - * [x] [`_mm512_cvt_roundph_ps`] - * [x] [`_mm512_cvt_roundps_epi32`] - * [x] [`_mm512_cvt_roundps_epu32`] - * [x] [`_mm512_cvt_roundps_pd`] - * [x] [`_mm512_cvtt_roundpd_epi32`] - * [x] [`_mm512_cvtt_roundpd_epu32`] - * [x] [`_mm512_cvtt_roundps_epi32`] - * [x] [`_mm512_cvtt_roundps_epu32`] * [x] [`_mm512_cvttpd_epi32`] - * [x] [`_mm512_cvttpd_epu32`] - * [x] [`_mm512_cvttps_epi32`] - * [x] [`_mm512_cvttps_epu32`] - - * [x] [`_mm512_int2mask`] - * [x] [`_mm512_kand`] - * [x] [`_mm512_kandn`] - * [x] [`_mm512_kmov`] - * [x] [`_mm512_knot`] - * [x] [`_mm512_kor`] - * [x] [`_mm512_kortestc`] - * [ ] [`_mm512_kortestz`] - * [x] [`_mm512_kunpackb`] - * [x] [`_mm512_kxnor`] - * [x] [`_mm512_kxor`] - * [x] [`_mm512_mask2int`] - * [x] [`_mm512_mask_cvt_roundepi32_ps`] - * [x] [`_mm512_mask_cvt_roundepu32_ps`] - * [x] [`_mm512_mask_cvt_roundpd_epi32`] - * [x] [`_mm512_mask_cvt_roundpd_epu32`] - * [x] [`_mm512_mask_cvt_roundpd_ps`] - * [x] [`_mm512_mask_cvt_roundph_ps`] - * [x] [`_mm512_mask_cvt_roundps_epi32`] - * [x] [`_mm512_mask_cvt_roundps_epu32`] - * [x] [`_mm512_mask_cvt_roundps_pd`] - * [x] [`_mm512_mask_cvtt_roundpd_epi32`] - * [x] [`_mm512_mask_cvtt_roundpd_epu32`] - * [x] [`_mm512_mask_cvtt_roundps_epi32`] - * [x] [`_mm512_mask_cvtt_roundps_epu32`] * [x] [`_mm512_mask_cvttpd_epi32`] + * [x] [`_mm512_maskz_cvttpd_epi32`] + * [x] [`_mm_mask_cvttpd_epi32`] + * [x] [`_mm_maskz_cvttpd_epi32`] + * [x] [`_mm256_mask_cvttpd_epi32`] + * [x] [`_mm256_maskz_cvttpd_epi32`] + * [x] [`_mm512_cvttpd_epu32`] * [x] [`_mm512_mask_cvttpd_epu32`] + * [x] [`_mm512_maskz_cvttpd_epu32`] + * [x] [`_mm_cvttpd_epu32`] + * [x] [`_mm_mask_cvttpd_epu32`] + * [x] [`_mm_maskz_cvttpd_epu32`] + * [x] [`_mm256_cvttpd_epu32`] + * [x] [`_mm256_mask_cvttpd_epu32`] + * [x] [`_mm256_maskz_cvttpd_epu32`] + * [x] [`_mm512_cvttps_epi32`] * [x] [`_mm512_mask_cvttps_epi32`] + * [x] [`_mm512_maskz_cvttps_epi32`] + + * [x] [`_mm512_cvttps_epu32`] * [x] [`_mm512_mask_cvttps_epu32`] + * [x] [`_mm512_maskz_cvttps_epu32`] + * [x] [`_mm512_cvt_roundepi32_ps`] + * [x] [`_mm512_mask_cvt_roundepi32_ps`] * [x] [`_mm512_maskz_cvt_roundepi32_ps`] + * [x] [`_mm512_cvt_roundepu32_ps`] + * [x] [`_mm512_mask_cvt_roundepu32_ps`] * [x] [`_mm512_maskz_cvt_roundepu32_ps`] + * [x] [`_mm512_cvt_roundpd_epi32`] + * [x] [`_mm512_mask_cvt_roundpd_epi32`] * [x] [`_mm512_maskz_cvt_roundpd_epi32`] + * [x] [`_mm512_cvt_roundpd_epu32`] + * [x] [`_mm512_mask_cvt_roundpd_epu32`] * [x] [`_mm512_maskz_cvt_roundpd_epu32`] + * [x] [`_mm512_cvt_roundpd_ps`] + * [x] [`_mm512_mask_cvt_roundpd_ps`] * [x] [`_mm512_maskz_cvt_roundpd_ps`] + * [x] [`_mm512_cvt_roundph_ps`] + * 
[x] [`_mm512_mask_cvt_roundph_ps`] * [x] [`_mm512_maskz_cvt_roundph_ps`] + * [x] [`_mm512_cvt_roundps_epi32`] + * [x] [`_mm512_mask_cvt_roundps_epi32`] * [x] [`_mm512_maskz_cvt_roundps_epi32`] + * [x] [`_mm512_cvt_roundps_epu32`] + * [x] [`_mm512_mask_cvt_roundps_epu32`] * [x] [`_mm512_maskz_cvt_roundps_epu32`] + * [x] [`_mm512_cvt_roundps_pd`] + * [x] [`_mm512_mask_cvt_roundps_pd`] * [x] [`_mm512_maskz_cvt_roundps_pd`] + * [x] [`_mm512_cvtt_roundpd_epi32`] + * [x] [`_mm512_mask_cvtt_roundpd_epi32`] * [x] [`_mm512_maskz_cvtt_roundpd_epi32`] + * [x] [`_mm512_cvtt_roundpd_epu32`] + * [x] [`_mm512_mask_cvtt_roundpd_epu32`] * [x] [`_mm512_maskz_cvtt_roundpd_epu32`] + * [x] [`_mm512_cvtt_roundps_epi32`] + * [x] [`_mm512_mask_cvtt_roundps_epi32`] * [x] [`_mm512_maskz_cvtt_roundps_epi32`] + * [x] [`_mm512_cvtt_roundps_epu32`] + * [x] [`_mm512_mask_cvtt_roundps_epu32`] * [x] [`_mm512_maskz_cvtt_roundps_epu32`] - * [x] [`_mm512_maskz_cvttpd_epi32`] - * [x] [`_mm512_maskz_cvttpd_epu32`] - * [x] [`_mm512_maskz_cvttps_epi32`] - * [x] [`_mm512_maskz_cvttps_epu32`] * [x] [`_mm_add_round_sd`] * [x] [`_mm_add_round_ss`] @@ -2614,4 +2611,17 @@ * [x] [`_mm_sqrt_round_ss`] * [x] [`_mm_sub_round_sd`] * [x] [`_mm_sub_round_ss`] + + * [x] [`_mm512_int2mask`] + * [x] [`_mm512_kand`] + * [x] [`_mm512_kandn`] + * [x] [`_mm512_kmov`] + * [x] [`_mm512_knot`] + * [x] [`_mm512_kor`] + * [x] [`_mm512_kortestc`] + * [ ] [`_mm512_kortestz`] + * [x] [`_mm512_kunpackb`] + * [x] [`_mm512_kxnor`] + * [x] [`_mm512_kxor`] + * [x] [`_mm512_mask2int`]
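A brief illustrative aside, not part of the patch: the mask/maskz conversion intrinsics added in this commit follow the usual AVX-512 masking convention, where the `mask` form copies unselected lanes from `src` and the `maskz` form zeroes them. A minimal usage sketch follows, assuming an x86_64 build in which the caller has already verified AVX512F and AVX512VL support at runtime; the helper name `truncate_even_lanes` is invented for the example and is not part of this change.

use core::arch::x86_64::{__m128i, __m256d, _mm256_maskz_cvttpd_epi32};

/// Hypothetical helper, for illustration only: truncate lanes 0 and 2 of `a`
/// to 32-bit integers. Lanes 1 and 3 of the result are zeroed because the
/// maskz form writes zero wherever the corresponding mask bit is clear.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn truncate_even_lanes(a: __m256d) -> __m128i {
    _mm256_maskz_cvttpd_epi32(0b0101, a)
}

The writemask variant, _mm256_mask_cvttpd_epi32(src, 0b0101, a), behaves the same way except that lanes 1 and 3 are copied from src instead of being zeroed.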

diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 9ec81d922a..8f2cf068fd 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -15537,6 +15537,54 @@ pub unsafe fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i { )) } +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttpd_epi32&expand=1945) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub unsafe fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + transmute(vcvttpd2dq256(a.as_f64x4(), src.as_i32x4(), k)) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttpd_epi32&expand=1946) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub unsafe fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i { + transmute(vcvttpd2dq256( + a.as_f64x4(), + _mm_setzero_si128().as_i32x4(), + k, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttpd_epi32&expand=1942) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub unsafe fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + transmute(vcvttpd2dq128(a.as_f64x2(), src.as_i32x4(), k)) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttpd_epi32&expand=1943) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub unsafe fn _mm_maskz_cvttpd_epi32(k: __mmask8, a: __m128d) -> __m128i { + transmute(vcvttpd2dq128( + a.as_f64x2(), + _mm_setzero_si128().as_i32x4(), + k, + )) +} + /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epu32&expand=1965) @@ -15582,6 +15630,82 @@ pub unsafe fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i { )) } +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttpd_epu32&expand=1962) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i { + transmute(vcvttpd2udq256( + a.as_f64x4(), + _mm_setzero_si128().as_i32x4(), + 0b11111111, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttpd_epu32&expand=1963) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + transmute(vcvttpd2udq256(a.as_f64x4(), src.as_i32x4(), k)) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttpd_epu32&expand=1964) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm256_maskz_cvttpd_epu32(k: __mmask8, a: __m256d) -> __m128i { + transmute(vcvttpd2udq256( + a.as_f64x4(), + _mm_setzero_si128().as_i32x4(), + k, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epu32&expand=1959) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm_cvttpd_epu32(a: __m128d) -> __m128i { + transmute(vcvttpd2udq128( + a.as_f64x2(), + _mm_setzero_si128().as_i32x4(), + 0b11111111, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttpd_epu32&expand=1960) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + transmute(vcvttpd2udq128(a.as_f64x2(), src.as_i32x4(), k)) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttpd_epu32&expand=1961) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { + transmute(vcvttpd2udq128( + a.as_f64x2(), + _mm_setzero_si128().as_i32x4(), + k, + )) +} + /// Returns vector of type `__m512d` with all elements set to zero. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_pd&expand=5018) @@ -38669,10 +38793,20 @@ extern "C" { fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"] fn vcvttps2udq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> u32x16; + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"] fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.256"] + fn vcvttpd2dq256(a: f64x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.128"] + fn vcvttpd2dq128(a: f64x2, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.512"] fn vcvttpd2udq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.256"] + fn vcvttpd2udq256(a: f64x4, src: i32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.128"] + fn vcvttpd2udq128(a: f64x2, src: i32x4, mask: u8) -> u32x4; #[link_name = "llvm.x86.avx512.mask.pmov.dw.128"] fn vpmovdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 7a7408510e..c7798dda2f 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4622,6 +4622,48 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttpd_epi32() { + let a = _mm256_setr_pd(4., -5.5, 6., -7.5); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvttpd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvttpd_epi32(src, 0b00001111, a); + let e = _mm_setr_epi32(4, -5, 6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttpd_epi32() { + let a = _mm256_setr_pd(4., -5.5, 6., -7.5); + let r = _mm256_maskz_cvttpd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvttpd_epi32(0b00001111, a); + let e = _mm_setr_epi32(4, -5, 6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttpd_epi32() { + let a = _mm_set_pd(6., -7.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttpd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttpd_epi32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, -7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttpd_epi32() { + let a = _mm_set_pd(6., -7.5); + let r = _mm_maskz_cvttpd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttpd_epi32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, -7); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvttpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); @@ -4651,6 +4693,64 @@ mod tests { assert_eq_m256i(r, e); } + #[simd_test(enable = 
"avx512f,avx512vl")] + unsafe fn test_mm256_cvttpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let r = _mm256_cvttpd_epu32(a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let src = _mm_set1_epi32(0); + let r = _mm256_mask_cvttpd_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm256_mask_cvttpd_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttpd_epu32() { + let a = _mm256_set_pd(4., 5.5, 6., 7.5); + let r = _mm256_maskz_cvttpd_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm256_maskz_cvttpd_epu32(0b00001111, a); + let e = _mm_set_epi32(4, 5, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvttpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let r = _mm_cvttpd_epu32(a); + let e = _mm_set_epi32(0, 0, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttpd_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttpd_epu32(src, 0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttpd_epu32() { + let a = _mm_set_pd(6., 7.5); + let r = _mm_maskz_cvttpd_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttpd_epu32(0b00000011, a); + let e = _mm_set_epi32(0, 0, 6, 7); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_add_round_pd() { let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007); From 63da44add4e7a1fd15a7256bc2bc75cfd2be8b8a Mon Sep 17 00:00:00 2001 From: jirong Date: Thu, 18 Feb 2021 11:56:25 -0500 Subject: [PATCH 19/31] cvttps_epi32,epu32: mm256,mm --- crates/core_arch/avx512f.md | 14 +- crates/core_arch/src/x86/avx512f.rs | 247 +++++++++++++++++++++++++++- 2 files changed, 250 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 4aa261d395..575ac94400 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2281,11 +2281,19 @@ * [x] [`_mm512_cvttps_epi32`] * [x] [`_mm512_mask_cvttps_epi32`] * [x] [`_mm512_maskz_cvttps_epi32`] - + * [x] [`_mm_mask_cvttps_epi32`] + * [x] [`_mm_maskz_cvttps_epi32`] + * [x] [`_mm256_mask_cvttps_epi32`] + * [x] [`_mm256_maskz_cvttps_epi32`] * [x] [`_mm512_cvttps_epu32`] * [x] [`_mm512_mask_cvttps_epu32`] * [x] [`_mm512_maskz_cvttps_epu32`] - + * [x] [`_mm_cvttps_epu32`] + * [x] [`_mm_mask_cvttps_epu32`] + * [x] [`_mm_maskz_cvttps_epu32`] + * [x] [`_mm256_cvttps_epu32`] + * [x] [`_mm256_mask_cvttps_epu32`] + * [x] [`_mm256_maskz_cvttps_epu32`] * [x] [`_mm512_cvt_roundepi32_ps`] * [x] [`_mm512_mask_cvt_roundepi32_ps`] * [x] [`_mm512_maskz_cvt_roundepi32_ps`] @@ -2325,7 +2333,6 @@ * [x] [`_mm512_cvtt_roundps_epu32`] * [x] [`_mm512_mask_cvtt_roundps_epu32`] * [x] [`_mm512_maskz_cvtt_roundps_epu32`] - * [x] [`_mm_add_round_sd`] * [x] [`_mm_add_round_ss`] * [x] [`_mm_cmp_round_sd_mask`] @@ -2611,7 +2618,6 @@ * [x] [`_mm_sqrt_round_ss`] * [x] [`_mm_sub_round_sd`] * [x] [`_mm_sub_round_ss`] - * [x] [`_mm512_int2mask`] * [x] [`_mm512_kand`] * [x] [`_mm512_kandn`] diff --git 
a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 8f2cf068fd..9721543502 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -15217,7 +15217,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512, sae: i32) #[rustc_args_required_const(1)] pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512, sae: i32) -> __m512i { let a = a.as_f32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); + let zero = _mm512_setzero_si512().as_u32x16(); macro_rules! call { ($imm4:expr) => { vcvttps2udq(a, zero, 0b11111111_11111111, $imm4) @@ -15242,7 +15242,7 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epu32( sae: i32, ) -> __m512i { let a = a.as_f32x16(); - let src = src.as_i32x16(); + let src = src.as_u32x16(); macro_rules! call { ($imm4:expr) => { vcvttps2udq(a, src, k, $imm4) @@ -15262,7 +15262,7 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epu32( #[rustc_args_required_const(2)] pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512, sae: i32) -> __m512i { let a = a.as_f32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); + let zero = _mm512_setzero_si512().as_u32x16(); macro_rules! call { ($imm4:expr) => { vcvttps2udq(a, zero, k, $imm4) @@ -15427,6 +15427,54 @@ pub unsafe fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { )) } +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttps_epi32&expand=1982) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub unsafe fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + transmute(vcvttps2dq256(a.as_f32x8(), src.as_i32x8(), k)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttps_epi32&expand=1983) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub unsafe fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i { + transmute(vcvttps2dq256( + a.as_f32x8(), + _mm256_setzero_si256().as_i32x8(), + k, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttps_epi32&expand=1979) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub unsafe fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + transmute(vcvttps2dq128(a.as_f32x4(), src.as_i32x4(), k)) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttps_epi32&expand=1980) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub unsafe fn _mm_maskz_cvttps_epi32(k: __mmask8, a: __m128) -> __m128i { + transmute(vcvttps2dq128( + a.as_f32x4(), + _mm_setzero_si128().as_i32x4(), + k, + )) +} + /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epu32&expand=2002) @@ -15436,7 +15484,7 @@ pub unsafe fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { pub unsafe fn _mm512_cvttps_epu32(a: __m512) -> __m512i { transmute(vcvttps2udq( a.as_f32x16(), - _mm512_setzero_si512().as_i32x16(), + _mm512_setzero_si512().as_u32x16(), 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION, )) @@ -15451,7 +15499,7 @@ pub unsafe fn _mm512_cvttps_epu32(a: __m512) -> __m512i { pub unsafe fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { transmute(vcvttps2udq( a.as_f32x16(), - src.as_i32x16(), + src.as_u32x16(), k, _MM_FROUND_CUR_DIRECTION, )) @@ -15466,12 +15514,88 @@ pub unsafe fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> pub unsafe fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i { transmute(vcvttps2udq( a.as_f32x16(), - _mm512_setzero_si512().as_i32x16(), + _mm512_setzero_si512().as_u32x16(), k, _MM_FROUND_CUR_DIRECTION, )) } +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttps_epu32&expand=1999) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm256_cvttps_epu32(a: __m256) -> __m256i { + transmute(vcvttps2udq256( + a.as_f32x8(), + _mm256_setzero_si256().as_u32x8(), + 0b11111111, + )) +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttps_epu32&expand=2000) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + transmute(vcvttps2udq256(a.as_f32x8(), src.as_u32x8(), k)) +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttps_epu32&expand=2001) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm256_maskz_cvttps_epu32(k: __mmask8, a: __m256) -> __m256i { + transmute(vcvttps2udq256( + a.as_f32x8(), + _mm256_setzero_si256().as_u32x8(), + k, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_epu32&expand=1996) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm_cvttps_epu32(a: __m128) -> __m128i { + transmute(vcvttps2udq128( + a.as_f32x4(), + _mm_setzero_si128().as_u32x4(), + 0b11111111, + )) +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttps_epu32&expand=1997) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + transmute(vcvttps2udq128(a.as_f32x4(), src.as_u32x4(), k)) +} + +/// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttps_epu32&expand=1998) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i { + transmute(vcvttps2udq128( + a.as_f32x4(), + _mm_setzero_si128().as_u32x4(), + k, + )) +} + /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// @@ -38791,8 +38915,17 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.cvttps2dq.512"] fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.mask.cvttps2dq.256"] + fn vcvttps2dq256(a: f32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx512.mask.cvttps2dq.128"] + fn vcvttps2dq128(a: f32x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"] - fn vcvttps2udq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> u32x16; + fn vcvttps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16; + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.256"] + fn vcvttps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx512.mask.cvttps2udq.128"] + fn vcvttps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4; #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"] fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8; @@ -45643,6 +45776,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epi32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epi32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epi32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttps_epi32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epi32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttps_epi32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvttps_epu32() { let a = _mm512_setr_ps( @@ -45678,6 +45853,64 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_cvttps_epu32(a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let src = _mm256_set1_epi32(0); + let r = _mm256_mask_cvttps_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_cvttps_epu32(src, 0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_cvttps_epu32() { + let a = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5); + let r = _mm256_maskz_cvttps_epu32(0, 
a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_cvttps_epu32(0b11111111, a); + let e = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_cvttps_epu32(a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let src = _mm_set1_epi32(0); + let r = _mm_mask_cvttps_epu32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_cvttps_epu32(src, 0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_cvttps_epu32() { + let a = _mm_set_ps(12., 13.5, 14., 15.5); + let r = _mm_maskz_cvttps_epu32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_cvttps_epu32(0b00001111, a); + let e = _mm_set_epi32(12, 13, 14, 15); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_ps() { let mut arr = [0f32; 256]; From 6b765a68bccfcf426a2e5f058a8cdf14d545e1cd Mon Sep 17 00:00:00 2001 From: jirong Date: Thu, 18 Feb 2021 12:29:25 -0500 Subject: [PATCH 20/31] test github ci --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c7cec5a858..11652cd14d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -139,8 +139,8 @@ jobs: - target: aarch64-unknown-linux-gnu os: ubuntu-latest # Temporarily disabled because otool crashes with "Out of memory", seems Github CI issue - #- target: x86_64-apple-darwin - # os: macos-latest + - target: x86_64-apple-darwin + os: macos-latest - target: x86_64-pc-windows-msvc os: windows-latest - target: i686-pc-windows-msvc From 6a40fe9eeb5819822d548c9e803745f671fab649 Mon Sep 17 00:00:00 2001 From: jirong Date: Thu, 18 Feb 2021 12:44:26 -0500 Subject: [PATCH 21/31] remove osX github CI --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 11652cd14d..c7cec5a858 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -139,8 +139,8 @@ jobs: - target: aarch64-unknown-linux-gnu os: ubuntu-latest # Temporarily disabled because otool crashes with "Out of memory", seems Github CI issue - - target: x86_64-apple-darwin - os: macos-latest + #- target: x86_64-apple-darwin + # os: macos-latest - target: x86_64-pc-windows-msvc os: windows-latest - target: i686-pc-windows-msvc From 43a76c914efc25bdc86447e6958ed881b1d5fcf9 Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 19 Feb 2021 08:39:02 -0500 Subject: [PATCH 22/31] mm_cvt_roundi64_ss,sd --- crates/core_arch/avx512f.md | 17 ++++---- crates/core_arch/src/x86/avx512f.rs | 66 +++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 575ac94400..b3a6f12fa0 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1,5 +1,4 @@ ["AVX512F"]

- * [x] [`_mm512_abs_epi32`] * [x] [`_mm512_mask_abs_epi32`] * [x] [`_mm512_maskz_abs_epi32`] @@ -2342,8 +2341,8 @@ * [x] [`_mm_comi_round_sd`] * [x] [`_mm_comi_round_ss`] * [x] [`_mm_cvt_roundi32_ss`] - * [ ] [`_mm_cvt_roundi64_sd`] - * [ ] [`_mm_cvt_roundi64_ss`] + * [x] [`_mm_cvt_roundi64_sd`] + * [x] [`_mm_cvt_roundi64_ss`] * [x] [`_mm_cvt_roundsd_i32`] * [ ] [`_mm_cvt_roundsd_i64`] * [x] [`_mm_cvt_roundsd_si32`] @@ -2482,8 +2481,8 @@ * [x] [`_mm_mask_getmant_round_ss`] * [x] [`_mm_mask_getmant_sd`] * [x] [`_mm_mask_getmant_ss`] - * [ ] [`_mm_mask_load_sd`] - * [ ] [`_mm_mask_load_ss`] + * [ ] [`_mm_mask_load_sd`] //need i1 + * [ ] [`_mm_mask_load_ss`] //need i1 * [x] [`_mm_mask_max_round_sd`] * [x] [`_mm_mask_max_round_ss`] * [x] [`_mm_mask_max_sd`] @@ -2514,8 +2513,8 @@ * [x] [`_mm_mask_sqrt_round_ss`] * [x] [`_mm_mask_sqrt_sd`] * [x] [`_mm_mask_sqrt_ss`] - * [ ] [`_mm_mask_store_sd`] - * [ ] [`_mm_mask_store_ss`] + * [ ] [`_mm_mask_store_sd`] //need i1 + * [ ] [`_mm_mask_store_ss`] //need i1 * [x] [`_mm_mask_sub_round_sd`] * [x] [`_mm_mask_sub_round_ss`] * [x] [`_mm_mask_sub_sd`] @@ -2560,8 +2559,8 @@ * [x] [`_mm_maskz_getmant_round_ss`] * [x] [`_mm_maskz_getmant_sd`] * [x] [`_mm_maskz_getmant_ss`] - * [ ] [`_mm_maskz_load_sd`] - * [ ] [`_mm_maskz_load_ss`] + * [ ] [`_mm_maskz_load_sd`] //need i1 + * [ ] [`_mm_maskz_load_ss`] //need i1 * [x] [`_mm_maskz_max_round_sd`] * [x] [`_mm_maskz_max_round_ss`] * [x] [`_mm_maskz_max_sd`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 9721543502..03abf5060a 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -38047,6 +38047,54 @@ pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { transmute(r) } +/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_ss&expand=1314) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { + let a = a.as_f32x4(); + macro_rules! call { + ($imm4:expr) => { + vcvtsi2ss64(a, b, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sd&expand=1313) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d { + let a = a.as_f64x2(); + macro_rules! call { + ($imm4:expr) => { + vcvtsi2sd(a, b, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ @@ -55262,6 +55310,24 @@ mod tests { assert_eq_m128(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundi64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i64 = 9; + let r = _mm_cvt_roundi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundi64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: i64 = 9; + let r = _mm_cvt_roundi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsi32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); From a2452bf0adbda620487a0e315cf0fdf15e2ba613 Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 19 Feb 2021 10:45:00 -0500 Subject: [PATCH 23/31] test --- crates/core_arch/avx512f.md | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index b3a6f12fa0..6f23d3306f 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1,4 +1,5 @@

["AVX512F"]

+ * [x] [`_mm512_abs_epi32`] * [x] [`_mm512_mask_abs_epi32`] * [x] [`_mm512_maskz_abs_epi32`] From 3a06fdb38f2606e47e322eaa4576e50d2cf6c0d2 Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 19 Feb 2021 11:06:42 -0500 Subject: [PATCH 24/31] _mm_cvt_roundsd_i64 --- crates/core_arch/avx512f.md | 2 +- crates/core_arch/src/x86/avx512f.rs | 38 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 6f23d3306f..b6ea6e5520 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2345,7 +2345,7 @@ * [x] [`_mm_cvt_roundi64_sd`] * [x] [`_mm_cvt_roundi64_ss`] * [x] [`_mm_cvt_roundsd_i32`] - * [ ] [`_mm_cvt_roundsd_i64`] + * [x] [`_mm_cvt_roundsd_i64`] * [x] [`_mm_cvt_roundsd_si32`] * [ ] [`_mm_cvt_roundsd_si64`] * [x] [`_mm_cvt_roundsd_ss`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 03abf5060a..6fae81a3ad 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -37977,6 +37977,31 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 { transmute(r) } +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_i64&expand=1358) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d, rounding: i32) -> i64 { + let a = a.as_f64x2(); + macro_rules! 
call { + ($imm4:expr) => { + vcvtsd2si64(a, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ @@ -39567,10 +39592,13 @@ extern "C" { fn vcvtss2usi(a: f32x4, rounding: i32) -> u32; #[link_name = "llvm.x86.avx512.vcvtss2usi64"] fn vcvtss2usi64(a: f32x4, rounding: i32) -> u64; + #[link_name = "llvm.x86.avx512.vcvtsd2si32"] fn vcvtsd2si(a: f64x2, rounding: i32) -> i32; + #[link_name = "llvm.x86.avx512.vcvtsd2si64"] fn vcvtsd2si64(a: f64x2, rounding: i32) -> i64; + #[link_name = "llvm.x86.avx512.vcvtsd2usi32"] fn vcvtsd2usi(a: f64x2, rounding: i32) -> u32; #[link_name = "llvm.x86.avx512.vcvtsd2usi64"] @@ -39582,8 +39610,10 @@ extern "C" { fn vcvtsi2ss64(a: f32x4, b: i64, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.cvtsi2sd64"] fn vcvtsi2sd(a: f64x2, b: i64, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.cvtusi2ss"] fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.cvtusi642ss"] fn vcvtusi2ss64(a: f32x4, b: u64, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.cvtusi642sd"] @@ -55277,6 +55307,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e: i64 = -1; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_u32() { let a = _mm_set_pd(1., -1.5); From f2e539270812d89e8562179e5ac593fce4f6aa90 Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 19 Feb 2021 13:04:50 -0500 Subject: [PATCH 25/31] mm_cvtroundsd_si64, mm_cvtroundsd_u64, mm_cvtroundsi64_ss,sd, mm_cvtroundss_i64,si64,u64; mm_cvtroundu64_ss,sd --- crates/core_arch/avx512f.md | 48 ++--- crates/core_arch/src/x86/avx512f.rs | 300 +++++++++++++++++++++++++++- 2 files changed, 316 insertions(+), 32 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index b6ea6e5520..00d97569f0 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2347,55 +2347,55 @@ * [x] [`_mm_cvt_roundsd_i32`] * [x] [`_mm_cvt_roundsd_i64`] * [x] [`_mm_cvt_roundsd_si32`] - * [ ] [`_mm_cvt_roundsd_si64`] + * [x] [`_mm_cvt_roundsd_si64`] - * [x] [`_mm_cvt_roundsd_ss`] * [x] [`_mm_cvt_roundsd_u32`] - * [ ] [`_mm_cvt_roundsd_u64`] + * [x] [`_mm_cvt_roundsd_u64`] - * [x] [`_mm_cvt_roundsi32_ss`] - * [ ] [`_mm_cvt_roundsi64_sd`] - * [ ] [`_mm_cvt_roundsi64_ss`] + * [x] [`_mm_cvt_roundsi64_sd`] - + * [x] [`_mm_cvt_roundsi64_ss`] - * [x] [`_mm_cvt_roundss_i32`] - * [ ] [`_mm_cvt_roundss_i64`] + * [x] [`_mm_cvt_roundss_i64`] - * [x] [`_mm_cvt_roundss_sd`] * [x] [`_mm_cvt_roundss_si32`] - * [ ] [`_mm_cvt_roundss_si64`] + * [x] [`_mm_cvt_roundss_si64`] - * [x] [`_mm_cvt_roundss_u32`] - * [ ] [`_mm_cvt_roundss_u64`] + * [x] [`_mm_cvt_roundss_u64`] - * [x] [`_mm_cvt_roundu32_ss`] - * [ ] [`_mm_cvt_roundu64_sd`] - * [ ] [`_mm_cvt_roundu64_ss`] + * [x] [`_mm_cvt_roundu64_sd`] - + * [x] [`_mm_cvt_roundu64_ss`] - * [x] [`_mm_cvti32_sd`] * [x] [`_mm_cvti32_ss`] - * [ ] [`_mm_cvti64_sd`] - * [ ] [`_mm_cvti64_ss`] + * [ ] [`_mm_cvti64_sd`] - + * [ ] [`_mm_cvti64_ss`] - * [x] [`_mm_cvtsd_i32`] - * [ ] [`_mm_cvtsd_i64`] + * [ ] [`_mm_cvtsd_i64`] - * [x] [`_mm_cvtsd_u32`] - * [ ] [`_mm_cvtsd_u64`] + * [ ] 
[`_mm_cvtsd_u64`] - * [x] [`_mm_cvtss_i32`] - * [ ] [`_mm_cvtss_i64`] + * [ ] [`_mm_cvtss_i64`] - * [x] [`_mm_cvtss_u32`] - * [ ] [`_mm_cvtss_u64`] + * [ ] [`_mm_cvtss_u64`] - * [x] [`_mm_cvtt_roundsd_i32`] * [x] [`_mm_cvtt_roundsd_i64`] * [x] [`_mm_cvtt_roundsd_si32`] - * [ ] [`_mm_cvtt_roundsd_si64`] + * [ ] [`_mm_cvtt_roundsd_si64`] - * [x] [`_mm_cvtt_roundsd_u32`] - * [ ] [`_mm_cvtt_roundsd_u64`] + * [ ] [`_mm_cvtt_roundsd_u64`] - * [x] [`_mm_cvtt_roundss_i32`] - * [ ] [`_mm_cvtt_roundss_i64`] + * [ ] [`_mm_cvtt_roundss_i64`] - * [x] [`_mm_cvtt_roundss_si32`] - * [ ] [`_mm_cvtt_roundss_si64`] + * [ ] [`_mm_cvtt_roundss_si64`] - * [x] [`_mm_cvtt_roundss_u32`] - * [ ] [`_mm_cvtt_roundss_u64`] + * [ ] [`_mm_cvtt_roundss_u64`] - * [x] [`_mm_cvttsd_i32`] - * [ ] [`_mm_cvttsd_i64`] + * [ ] [`_mm_cvttsd_i64`] - * [x] [`_mm_cvttsd_u32`] - * [ ] [`_mm_cvttsd_u64`] + * [ ] [`_mm_cvttsd_u64`] - * [x] [`_mm_cvttss_i32`] - * [ ] [`_mm_cvttss_i64`] + * [ ] [`_mm_cvttss_i64`] - * [x] [`_mm_cvttss_u32`] - * [ ] [`_mm_cvttss_u64`] + * [ ] [`_mm_cvttss_u64`] - * [x] [`_mm_cvtu32_sd`] * [x] [`_mm_cvtu32_ss`] * [x] [`_mm_cvtu64_sd`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 6fae81a3ad..933fb965ae 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -37858,7 +37858,6 @@ pub unsafe fn _mm_cvt_roundss_si32(a: __m128, rounding: i32) -> i32 { } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ -/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -37882,8 +37881,55 @@ pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 { transmute(r) } -/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_i64&expand=1370) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvt_roundss_i64(a: __m128, rounding: i32) -> i64 { + let a = a.as_f32x4(); + macro_rules! 
call { + ($imm4:expr) => { + vcvtss2si64(a, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_si64&expand=1375) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvt_roundss_si64(a: __m128, rounding: i32) -> i64 { + let a = a.as_f32x4(); + macro_rules! call { + ($imm4:expr) => { + vcvtss2si64(a, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -37907,6 +37953,30 @@ pub unsafe fn _mm_cvt_roundss_u32(a: __m128, rounding: i32) -> u32 { transmute(r) } +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_u64&expand=1377) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvt_roundss_u64(a: __m128, rounding: i32) -> u64 { + let a = a.as_f32x4(); + macro_rules! call { + ($imm4:expr) => { + vcvtss2usi64(a, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst. 
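// --- Illustrative aside, not part of the patch ---
// A minimal sketch of _mm_cvt_roundss_u64 as just defined, assuming the module's
// intrinsics are in scope; the demo function name is hypothetical. An out-of-range input
// (such as a negative value) yields the unsigned "integer indefinite" value, i.e. all
// bits set, which is why the new tests in this series expect u64::MAX.
#[target_feature(enable = "avx512f")]
unsafe fn cvt_roundss_u64_demo() {
    let in_range = _mm_set_ps(0., 0., 0., 1.5);
    let negative = _mm_set_ps(0., 0., 0., -1.5);
    assert_eq!(_mm_cvt_roundss_u64(in_range, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC), 1);
    assert_eq!(_mm_cvt_roundss_u64(negative, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC), u64::MAX);
}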
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtss_i32&expand=1893) @@ -37928,7 +37998,6 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ -/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -37953,7 +38022,6 @@ pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d, rounding: i32) -> i32 { } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ -/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -37978,7 +38046,6 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 { } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ -/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -38002,8 +38069,31 @@ pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d, rounding: i32) -> i64 { transmute(r) } -/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_si64&expand=1360) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d, rounding: i32) -> i64 { + let a = a.as_f64x2(); + macro_rules! 
call { + ($imm4:expr) => { + vcvtsd2si64(a, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -38027,6 +38117,30 @@ pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d, rounding: i32) -> u32 { transmute(r) } +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_u64&expand=1365) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d, rounding: i32) -> u64 { + let a = a.as_f64x2(); + macro_rules! call { + ($imm4:expr) => { + vcvtsd2usi64(a, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_i32&expand=1791) @@ -38120,6 +38234,54 @@ pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d transmute(r) } +/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_ss&expand=1368) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { + let a = a.as_f32x4(); + macro_rules! 
call { + ($imm4:expr) => { + vcvtsi2ss64(a, b, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_sd&expand=1367) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d { + let a = a.as_f64x2(); + macro_rules! call { + ($imm4:expr) => { + vcvtsi2sd(a, b, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ @@ -38146,7 +38308,6 @@ pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { } /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -38170,6 +38331,54 @@ pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32, rounding: i32) -> __m128 { transmute(r) } +/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_ss&expand=1380) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64, rounding: i32) -> __m128 { + let a = a.as_f32x4(); + macro_rules! 
call { + ($imm4:expr) => { + vcvtusi2ss64(a, b, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_sd&expand=1379) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtusi2sd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64, rounding: i32) -> __m128d { + let a = a.as_f64x2(); + macro_rules! call { + ($imm4:expr) => { + vcvtusi2sd(a, b, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_ss&expand=1643) @@ -39613,7 +39822,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.cvtusi2ss"] fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4; - #[link_name = "llvm.x86.avx512.cvtusi642ss"] fn vcvtusi2ss64(a: f32x4, b: u64, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.cvtusi642sd"] @@ -55267,6 +55475,22 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e: i64 = -1; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_si64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e: i64 = -1; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55275,6 +55499,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundss_u64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvt_roundss_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtss_i32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55315,6 +55547,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsd_si64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvt_roundsd_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e: i64 = -1; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_u32() { let a = _mm_set_pd(1., -1.5); @@ -55323,6 +55563,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm_cvt_roundsd_u64() { + let a = _mm_set_pd(1., f64::MAX); + let r = _mm_cvt_roundsd_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtsd_i32() { let a = _mm_set_pd(1., -1.5); @@ -55366,6 +55614,24 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsi64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i64 = 9; + let r = _mm_cvt_roundsi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundsi64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: i64 = 9; + let r = _mm_cvt_roundsi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsi32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55384,6 +55650,24 @@ mod tests { assert_eq_m128(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundu64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: u64 = 9; + let r = _mm_cvt_roundu64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvt_roundu64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: u64 = 9; + let r = _mm_cvt_roundu64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvti32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); From b8bd2fb5d1f6e9aed41ecfc1b642e965953abc72 Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 19 Feb 2021 14:23:23 -0500 Subject: [PATCH 26/31] _mm_cvti64_ss,sd; _mm_cvtsd_i64,u64; _mm_cvtss_i64,u64 --- crates/core_arch/avx512f.md | 30 +++--- crates/core_arch/src/x86/avx512f.rs | 156 ++++++++++++++++++++++++---- 2 files changed, 148 insertions(+), 38 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 00d97569f0..632d6bb4b2 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2347,35 +2347,35 @@ * [x] [`_mm_cvt_roundsd_i32`] * [x] [`_mm_cvt_roundsd_i64`] * [x] [`_mm_cvt_roundsd_si32`] - * [x] [`_mm_cvt_roundsd_si64`] - + * [x] [`_mm_cvt_roundsd_si64`] * [x] [`_mm_cvt_roundsd_ss`] * [x] [`_mm_cvt_roundsd_u32`] - * [x] [`_mm_cvt_roundsd_u64`] - + * [x] [`_mm_cvt_roundsd_u64`] * [x] [`_mm_cvt_roundsi32_ss`] - * [x] [`_mm_cvt_roundsi64_sd`] - - * [x] [`_mm_cvt_roundsi64_ss`] - + * [x] [`_mm_cvt_roundsi64_sd`] + * [x] [`_mm_cvt_roundsi64_ss`] * [x] [`_mm_cvt_roundss_i32`] - * [x] [`_mm_cvt_roundss_i64`] - + * [x] [`_mm_cvt_roundss_i64`] * [x] [`_mm_cvt_roundss_sd`] * [x] [`_mm_cvt_roundss_si32`] - * [x] [`_mm_cvt_roundss_si64`] - + * [x] [`_mm_cvt_roundss_si64`] * [x] [`_mm_cvt_roundss_u32`] - * [x] [`_mm_cvt_roundss_u64`] - + * [x] [`_mm_cvt_roundss_u64`] * [x] [`_mm_cvt_roundu32_ss`] - * [x] [`_mm_cvt_roundu64_sd`] - - * [x] [`_mm_cvt_roundu64_ss`] - + * [x] [`_mm_cvt_roundu64_sd`] + * [x] [`_mm_cvt_roundu64_ss`] * [x] [`_mm_cvti32_sd`] * [x] [`_mm_cvti32_ss`] - * [ ] [`_mm_cvti64_sd`] - - * [ ] [`_mm_cvti64_ss`] - + * [x] [`_mm_cvti64_sd`] - + * [x] [`_mm_cvti64_ss`] - * [x] [`_mm_cvtsd_i32`] - * [ ] [`_mm_cvtsd_i64`] - + * [x] [`_mm_cvtsd_i64`] - * [x] [`_mm_cvtsd_u32`] - * [ ] [`_mm_cvtsd_u64`] - + * [x] [`_mm_cvtsd_u64`] - * [x] 
[`_mm_cvtss_i32`] - * [ ] [`_mm_cvtss_i64`] - + * [x] [`_mm_cvtss_i64`] - * [x] [`_mm_cvtss_u32`] - * [ ] [`_mm_cvtss_u64`] - + * [x] [`_mm_cvtss_u64`] - * [x] [`_mm_cvtt_roundsd_i32`] * [x] [`_mm_cvtt_roundsd_i64`] * [x] [`_mm_cvtt_roundsd_si32`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 933fb965ae..cb1e1329aa 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1,5 +1,5 @@ use crate::{ - core_arch::{simd::*, simd_llvm::*, x86::*}, + core_arch::{simd::*, simd_llvm::*, x86::*, x86_64::*}, mem::{self, transmute}, ptr, }; @@ -37427,8 +37427,8 @@ pub unsafe fn _mm_maskz_fixupimm_sd( } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ -/// /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_round_ss&expand=2511) #[inline] #[target_feature(enable = "avx512f")] @@ -37456,8 +37456,8 @@ pub unsafe fn _mm_fixupimm_round_ss( } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ -/// /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_round_ss&expand=2512) #[inline] #[target_feature(enable = "avx512f")] @@ -37486,8 +37486,8 @@ pub unsafe fn _mm_mask_fixupimm_round_ss( } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ -/// /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_round_ss&expand=2513) #[inline] #[target_feature(enable = "avx512f")] @@ -37516,8 +37516,8 @@ pub unsafe fn _mm_maskz_fixupimm_round_ss( } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ -/// /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_round_sd&expand=2508) #[inline] #[target_feature(enable = "avx512f")] @@ -37545,8 +37545,8 @@ pub unsafe fn _mm_fixupimm_round_sd( } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ -/// /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_round_sd&expand=2509) #[inline] #[target_feature(enable = "avx512f")] @@ -37575,8 +37575,8 @@ pub unsafe fn _mm_mask_fixupimm_round_sd( } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ -/// /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_round_sd&expand=2510) #[inline] #[target_feature(enable = "avx512f")] @@ -37740,13 +37740,12 @@ pub unsafe fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128, sae: transmute(r) } -/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
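// --- Illustrative aside, not part of the patch ---
// A minimal sketch of the narrowing convert whose documentation is being reflowed here,
// assuming the module's intrinsics are in scope; the demo function name is hypothetical.
// An f64 that is not exactly representable as f32 lands on one of two neighbouring f32
// values depending on the rounding flag.
#[target_feature(enable = "avx512f")]
unsafe fn cvt_roundsd_ss_demo() {
    let a = _mm_set1_ps(0.);
    let b = _mm_set1_pd(core::f64::consts::PI);
    let down = _mm_cvt_roundsd_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    let up = _mm_cvt_roundsd_ss(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    assert!(_mm_cvtss_f32(down) < _mm_cvtss_f32(up)); // the two modes differ for an inexact input
}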
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_ss&expand=1361) @@ -37767,13 +37766,12 @@ pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d, rounding: i32) -> __m128 transmute(r) } -/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundsd_ss&expand=1362) @@ -37801,7 +37799,6 @@ pub unsafe fn _mm_mask_cvt_roundsd_ss( } /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -37833,7 +37830,6 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss( } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ -/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ @@ -37997,6 +37993,16 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) } +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_u64&expand=1902) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2usi))] +pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 { + transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) +} + /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ @@ -38151,6 +38157,26 @@ pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 { transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) } +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_i64&expand=1792) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2si))] +pub unsafe fn _mm_cvtsd_i64(a: __m128d) -> i64 { + _mm_cvtsd_si64(a) +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_i64&expand=1894) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2si))] +pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 { + _mm_cvtss_si64(a) +} + /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_u32&expand=1799) @@ -38161,6 +38187,16 @@ pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 { transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) } +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_u64&expand=1800) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2usi))] +pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { + transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) +} + /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ @@ -38403,6 +38439,30 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { transmute(r) } +/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
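// --- Illustrative aside, not part of the patch ---
// A minimal sketch of the _mm_cvti64_ss being introduced here (its body follows below),
// assuming the module's intrinsics are in scope; the demo function name is hypothetical.
// Only lane 0 is rewritten; the upper three lanes of `a` pass through unchanged, and the
// conversion itself is a plain `as f32` cast, i.e. round to nearest.
#[target_feature(enable = "avx512f")]
unsafe fn cvti64_ss_demo() {
    let a = _mm_set_ps(4., 3., 2., 1.);
    let r = _mm_cvti64_ss(a, 9);
    assert_eq!(_mm_cvtss_f32(r), 9.); // lane 0 now holds 9.0; lanes 1..3 remain 2.0, 3.0, 4.0
}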
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_ss&expand=1643) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsi2ss))] +pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { + let b = b as f32; + let r = simd_insert(a, 0, b); + transmute(r) +} + +/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti64_sd&expand=1644) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsi2sd))] +pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { + let b = b as f64; + let r = simd_insert(a, 0, b); + transmute(r) +} + /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// @@ -55515,6 +55575,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_i64(a); + let e: i64 = -2; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55523,6 +55591,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_u64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_u64(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_si32() { let a = _mm_set_pd(1., -1.5); @@ -55579,6 +55655,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_i64(a); + let e: i64 = -2; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtsd_u32() { let a = _mm_set_pd(1., -1.5); @@ -55587,6 +55671,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_u64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_u64(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundi32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55686,6 +55778,24 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvti64_ss() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let b: i64 = 9; + let r = _mm_cvti64_ss(a, b); + let e = _mm_set_ps(0., -0.5, 1., 9.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvti64_sd() { + let a = _mm_set_pd(1., -1.5); + let b: i64 = 9; + let r = _mm_cvti64_sd(a, b); + let e = _mm_set_pd(1., 9.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_si32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); From cea23e377e72b7b362db2e4b4c24cfca5b6c897c Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 19 Feb 2021 14:41:50 -0500 Subject: [PATCH 27/31] fix build doc --- crates/core_arch/src/x86/avx512f.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index cb1e1329aa..4ab99d23bb 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ 
b/crates/core_arch/src/x86/avx512f.rs @@ -1,5 +1,6 @@ use crate::{ - core_arch::{simd::*, simd_llvm::*, x86::*, x86_64::*}, + core_arch::x86_64::{_mm_cvtsd_si64, _mm_cvtss_si64}, + core_arch::{simd::*, simd_llvm::*, x86::*}, mem::{self, transmute}, ptr, }; From 1378c48d13f347231e6341c3fd988aca34b40e21 Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 19 Feb 2021 15:04:18 -0500 Subject: [PATCH 28/31] fix build doc 2 --- crates/core_arch/avx512f.md | 12 +++---- crates/core_arch/src/x86/avx512f.rs | 38 +-------------------- crates/core_arch/src/x86_64/avx512f.rs | 46 +++++++++++++++++++++++--- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 632d6bb4b2..63ffd6ba10 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2366,16 +2366,16 @@ * [x] [`_mm_cvt_roundu64_ss`] * [x] [`_mm_cvti32_sd`] * [x] [`_mm_cvti32_ss`] - * [x] [`_mm_cvti64_sd`] - - * [x] [`_mm_cvti64_ss`] - + * [x] [`_mm_cvti64_sd`] + * [x] [`_mm_cvti64_ss`] * [x] [`_mm_cvtsd_i32`] - * [x] [`_mm_cvtsd_i64`] - + * [x] [`_mm_cvtsd_i64`] * [x] [`_mm_cvtsd_u32`] - * [x] [`_mm_cvtsd_u64`] - + * [x] [`_mm_cvtsd_u64`] * [x] [`_mm_cvtss_i32`] - * [x] [`_mm_cvtss_i64`] - + * [x] [`_mm_cvtss_i64`] * [x] [`_mm_cvtss_u32`] - * [x] [`_mm_cvtss_u64`] - + * [x] [`_mm_cvtss_u64`] * [x] [`_mm_cvtt_roundsd_i32`] * [x] [`_mm_cvtt_roundsd_i64`] * [x] [`_mm_cvtt_roundsd_si32`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4ab99d23bb..c6b4f517f4 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1,5 +1,5 @@ use crate::{ - core_arch::x86_64::{_mm_cvtsd_si64, _mm_cvtss_si64}, + //core_arch::x86_64::{_mm_cvtsd_si64, _mm_cvtss_si64}, core_arch::{simd::*, simd_llvm::*, x86::*}, mem::{self, transmute}, ptr, @@ -38158,26 +38158,6 @@ pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 { transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) } -/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_i64&expand=1792) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si))] -pub unsafe fn _mm_cvtsd_i64(a: __m128d) -> i64 { - _mm_cvtsd_si64(a) -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_i64&expand=1894) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si))] -pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 { - _mm_cvtss_si64(a) -} - /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_u32&expand=1799) @@ -55576,14 +55556,6 @@ mod tests { assert_eq!(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtss_i64() { - let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtss_i64(a); - let e: i64 = -2; - assert_eq!(r, e); - } - #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55656,14 +55628,6 @@ mod tests { assert_eq!(r, e); } - #[simd_test(enable = "avx512f")] - unsafe fn test_mm_cvtsd_i64() { - let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtsd_i64(a); - let e: i64 = -2; - assert_eq!(r, e); - } - #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtsd_u32() { let a = _mm_set_pd(1., -1.5); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index c7798dda2f..46f06bd083 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1,8 +1,28 @@ -//use crate::{ -// -// core_arch::{simd::*, simd_llvm::*, x86::*}, -// mem::transmute, -//}; +use crate::core_arch::x86::*; +use crate::core_arch::x86_64::*; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_i64&expand=1792) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2si))] +pub unsafe fn _mm_cvtsd_i64(a: __m128d) -> i64 { + _mm_cvtsd_si64(a) +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst. 
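// --- Illustrative aside, not part of the patch ---
// A minimal sketch of the x86_64-only wrapper added here, assuming the module's
// intrinsics are in scope; the demo function name is hypothetical. _mm_cvtsd_i64 simply
// forwards to the SSE2 _mm_cvtsd_si64, so it follows the current MXCSR rounding mode
// (round to nearest even by default): -1.5 converts to -2 rather than truncating to -1.
#[target_feature(enable = "avx512f")]
unsafe fn cvtsd_i64_demo() {
    let a = _mm_set_pd(1., -1.5);
    assert_eq!(_mm_cvtsd_i64(a), _mm_cvtsd_si64(a));
    assert_eq!(_mm_cvtsd_i64(a), -2); // assuming the default rounding mode
}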
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_i64&expand=1894) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2si))] +pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 { + _mm_cvtss_si64(a) +} #[cfg(test)] mod tests { @@ -11785,4 +11805,20 @@ mod tests { let e = _mm_set1_epi64x(11); assert_eq_m128i(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtsd_i64(a); + let e: i64 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtss_i64(a); + let e: i64 = -2; + assert_eq!(r, e); + } } From 511506680b00504eee68c5b4dbffe26a7cbfcbf8 Mon Sep 17 00:00:00 2001 From: jirong Date: Sat, 20 Feb 2021 09:38:26 -0500 Subject: [PATCH 29/31] mm_cvtt_roundsd_si64,u64; mm_cvtt_roundss_i64,si64,u64 --- crates/core_arch/avx512f.md | 10 +- crates/core_arch/src/x86/avx512f.rs | 163 +++++++++++++++++++++++++++- 2 files changed, 167 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 63ffd6ba10..48f6d39836 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2379,15 +2379,15 @@ * [x] [`_mm_cvtt_roundsd_i32`] * [x] [`_mm_cvtt_roundsd_i64`] * [x] [`_mm_cvtt_roundsd_si32`] - * [ ] [`_mm_cvtt_roundsd_si64`] - + * [x] [`_mm_cvtt_roundsd_si64`] - * [x] [`_mm_cvtt_roundsd_u32`] - * [ ] [`_mm_cvtt_roundsd_u64`] - + * [x] [`_mm_cvtt_roundsd_u64`] - * [x] [`_mm_cvtt_roundss_i32`] - * [ ] [`_mm_cvtt_roundss_i64`] - + * [x] [`_mm_cvtt_roundss_i64`] - * [x] [`_mm_cvtt_roundss_si32`] - * [ ] [`_mm_cvtt_roundss_si64`] - + * [x] [`_mm_cvtt_roundss_si64`] - * [x] [`_mm_cvtt_roundss_u32`] - * [ ] [`_mm_cvtt_roundss_u64`] - + * [x] [`_mm_cvtt_roundss_u64`] - * [x] [`_mm_cvttsd_i32`] * [ ] [`_mm_cvttsd_i64`] - * [x] [`_mm_cvttsd_u32`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c6b4f517f4..9a4ec5e339 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -38482,6 +38482,44 @@ pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 { transmute(r) } +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_i64&expand=1935) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvtt_roundss_i64(a: __m128, sae: i32) -> i64 { + let a = a.as_f32x4(); + macro_rules! call { + ($imm4:expr) => { + vcvtss2si64(a, $imm4) + }; + } + let r = constify_imm4_sae!(sae, call); + transmute(r) +} + +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_si64&expand=1937) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvtt_roundss_si64(a: __m128, sae: i32) -> i64 { + let a = a.as_f32x4(); + macro_rules! call { + ($imm4:expr) => { + vcvtss2si64(a, $imm4) + }; + } + let r = constify_imm4_sae!(sae, call); + transmute(r) +} + /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// @@ -38501,6 +38539,25 @@ pub unsafe fn _mm_cvtt_roundss_u32(a: __m128, sae: i32) -> u32 { transmute(r) } +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_u64&expand=1939) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvtt_roundss_u64(a: __m128, sae: i32) -> u64 { + let a = a.as_f32x4(); + macro_rules! call { + ($imm4:expr) => { + vcvtss2usi64(a, $imm4) + }; + } + let r = constify_imm4_sae!(sae, call); + transmute(r) +} + /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttss_i32&expand=2022) @@ -38559,6 +38616,44 @@ pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 { transmute(r) } +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si64&expand=1931) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d, sae: i32) -> i64 { + let a = a.as_f64x2(); + macro_rules! call { + ($imm4:expr) => { + vcvtsd2si64(a, $imm4) + }; + } + let r = constify_imm4_sae!(sae, call); + transmute(r) +} + +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i64&expand=1929) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d, sae: i32) -> i64 { + let a = a.as_f64x2(); + macro_rules! 
call { + ($imm4:expr) => { + vcvtsd2si64(a, $imm4) + }; + } + let r = constify_imm4_sae!(sae, call); + transmute(r) +} + /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// @@ -38578,6 +38673,25 @@ pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d, sae: i32) -> u32 { transmute(r) } +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_u64&expand=1933) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d, sae: i32) -> u64 { + let a = a.as_f64x2(); + macro_rules! call { + ($imm4:expr) => { + vcvtsd2usi64(a, $imm4) + }; + } + let r = constify_imm4_sae!(sae, call); + transmute(r) +} + /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttsd_i32&expand=2015) @@ -39845,7 +39959,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.vcvtsd2si32"] fn vcvtsd2si(a: f64x2, rounding: i32) -> i32; - #[link_name = "llvm.x86.avx512.vcvtsd2si64"] fn vcvtsd2si64(a: f64x2, rounding: i32) -> i64; @@ -55777,6 +55890,22 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_i64(a, _MM_FROUND_CUR_DIRECTION); + let e: i64 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_si64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_si64(a, _MM_FROUND_CUR_DIRECTION); + let e: i64 = -2; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55785,6 +55914,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundss_u64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvtt_roundss_u64(a, _MM_FROUND_CUR_DIRECTION); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvttss_i32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55817,6 +55954,22 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_i64(a, _MM_FROUND_CUR_DIRECTION); + let e: i64 = -2; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_si64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_si64(a, _MM_FROUND_CUR_DIRECTION); + let e: i64 = -2; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_u32() { let a = _mm_set_pd(1., -1.5); @@ -55825,6 +55978,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtt_roundsd_u64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvtt_roundsd_u64(a, _MM_FROUND_CUR_DIRECTION); + let e: u64 = u64::MAX; + 
assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvttsd_i32() { let a = _mm_set_pd(1., -1.5); From b092f2782988d1d5cd70fecde5e8404cf0a9bca5 Mon Sep 17 00:00:00 2001 From: jirong Date: Sat, 20 Feb 2021 10:07:16 -0500 Subject: [PATCH 30/31] cvttsd_i64,u64; cvttss_i64,u64 --- crates/core_arch/avx512f.md | 20 +++--- crates/core_arch/src/x86/avx512f.rs | 96 +++++++++++++++++++++++++---- 2 files changed, 94 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 48f6d39836..b45be34078 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2261,7 +2261,7 @@ * [x] [`_mm512_mask_cvtusepi64_storeu_epi8`] * [x] [`_mm_mask_cvtusepi64_storeu_epi8`] * [x] [`_mm256_mask_cvtusepi64_storeu_epi8`] - * [x] ['_mm512_cvtsi512_si32'] + * [x] [`_mm512_cvtsi512_si32`] * [x] [`_mm512_cvttpd_epi32`] * [x] [`_mm512_mask_cvttpd_epi32`] * [x] [`_mm512_maskz_cvttpd_epi32`] @@ -2379,23 +2379,23 @@ * [x] [`_mm_cvtt_roundsd_i32`] * [x] [`_mm_cvtt_roundsd_i64`] * [x] [`_mm_cvtt_roundsd_si32`] - * [x] [`_mm_cvtt_roundsd_si64`] - + * [x] [`_mm_cvtt_roundsd_si64`] * [x] [`_mm_cvtt_roundsd_u32`] - * [x] [`_mm_cvtt_roundsd_u64`] - + * [x] [`_mm_cvtt_roundsd_u64`] * [x] [`_mm_cvtt_roundss_i32`] - * [x] [`_mm_cvtt_roundss_i64`] - + * [x] [`_mm_cvtt_roundss_i64`] * [x] [`_mm_cvtt_roundss_si32`] - * [x] [`_mm_cvtt_roundss_si64`] - + * [x] [`_mm_cvtt_roundss_si64`] * [x] [`_mm_cvtt_roundss_u32`] - * [x] [`_mm_cvtt_roundss_u64`] - + * [x] [`_mm_cvtt_roundss_u64`] * [x] [`_mm_cvttsd_i32`] - * [ ] [`_mm_cvttsd_i64`] - + * [x] [`_mm_cvttsd_i64`] - * [x] [`_mm_cvttsd_u32`] - * [ ] [`_mm_cvttsd_u64`] - + * [x] [`_mm_cvttsd_u64`] - * [x] [`_mm_cvttss_i32`] - * [ ] [`_mm_cvttss_i64`] - + * [x] [`_mm_cvttss_i64`] - * [x] [`_mm_cvttss_u32`] - * [ ] [`_mm_cvttss_u64`] - + * [x] [`_mm_cvttss_u64`] - * [x] [`_mm_cvtu32_sd`] * [x] [`_mm_cvtu32_ss`] * [x] [`_mm_cvtu64_sd`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 9a4ec5e339..769e243919 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -38560,7 +38560,7 @@ pub unsafe fn _mm_cvtt_roundss_u64(a: __m128, sae: i32) -> u64 { /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttss_i32&expand=2022) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_i32&expand=2022) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2si))] @@ -38568,9 +38568,19 @@ pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 { transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) } +/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=#text=_mm_cvttss_i64&expand=2023) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2si))] +pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 { + transmute(vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) +} + /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttss_u32&expand=2026) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_u32&expand=2026) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtss2usi))] @@ -38578,10 +38588,20 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) } +/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_u64&expand=2027) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtss2usi))] +pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { + transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) +} + /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_si32&expand=1930) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si32&expand=1930) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] @@ -38600,7 +38620,7 @@ pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d, sae: i32) -> i32 { /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_i32&expand=1928) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i32&expand=1928) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] @@ -38694,7 +38714,7 @@ pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d, sae: i32) -> u64 { /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttsd_i32&expand=2015) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_i32&expand=2015) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2si))] @@ -38702,9 +38722,19 @@ pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 { transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) } +/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_i64&expand=2016) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2si))] +pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 { + transmute(vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) +} + /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttsd_u32&expand=2020) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_u32&expand=2020) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] @@ -38712,9 +38742,19 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) } +/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_u64&expand=2021) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvtsd2usi))] +pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 { + transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) +} + /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtu32_ss&expand=2032) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_ss&expand=2032) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtusi2ss))] @@ -38726,7 +38766,7 @@ pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { /// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtu32_sd&expand=2031) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sd&expand=2031) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcvtusi2sd))] @@ -38738,7 +38778,7 @@ pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtu64_ss&expand=2035) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_ss&expand=2035) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(mov))] // should be vcvtusi2ss @@ -38750,7 +38790,7 @@ pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtu64_sd&expand=2034) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sd&expand=2034) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(mov))] // should be vcvtusi2sd @@ -38763,7 +38803,7 @@ pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_comi_round_ss&expand=1175) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_ss&expand=1175) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp, imm8 = 5, sae = 4))] //should be vcomiss @@ -38783,7 +38823,7 @@ pub unsafe fn _mm_comi_round_ss(a: __m128, b: __m128, imm8: i32, sae: i32) -> i3 /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_comi_round_sd&expand=1174) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sd&expand=1174) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp, imm8 = 5, sae = 4))] //should be vcomisd @@ -55930,6 +55970,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttss_i64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvttss_i64(a); + let e: i64 = -2; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvttss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); @@ -55938,6 +55986,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttss_u64() { + let a = _mm_set_ps(0., -0.5, 1., -1.5); + let r = _mm_cvttss_u64(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_si32() { let a = _mm_set_pd(1., -1.5); @@ -55994,6 +56050,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttsd_i64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvttsd_i64(a); + let e: i64 = -2; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvttsd_u32() { let a = _mm_set_pd(1., -1.5); @@ -56002,6 +56066,14 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvttsd_u64() { + let a = _mm_set_pd(1., -1.5); + let r = _mm_cvttsd_u64(a); + let e: u64 = u64::MAX; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtu32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); From 7ef85889efb084f621118b85d68ca03208c1907f Mon Sep 17 00:00:00 2001 From: jirong Date: Sat, 20 Feb 2021 17:42:14 -0500 Subject: [PATCH 31/31] move x86_64 ones to x86_64 --- crates/core_arch/avx512bw.md | 34 +- crates/core_arch/avx512f.md | 10 +- crates/core_arch/avx512vbmi2.md | 36 +- crates/core_arch/src/x86/avx512f.rs | 758 ------------------------ crates/core_arch/src/x86_64/avx512f.rs | 771 ++++++++++++++++++++++++- crates/core_arch/src/x86_64/macros.rs | 32 + crates/core_arch/src/x86_64/mod.rs | 3 + 7 files changed, 844 insertions(+), 800 deletions(-) create mode 100644 crates/core_arch/src/x86_64/macros.rs diff --git a/crates/core_arch/avx512bw.md b/crates/core_arch/avx512bw.md index 7484e8792b..367cb5de2a 100644 --- a/crates/core_arch/avx512bw.md +++ b/crates/core_arch/avx512bw.md @@ -1,34 +1,34 @@

["AVX512BW"]

* [x] [`_mm512_loadu_epi16`] - * [_] [`_mm512_mask_loadu_epi16`] - * [_] [`_mm512_maskz_loadu_epi16`] + * [_] [`_mm512_mask_loadu_epi16`] //need i1 + * [_] [`_mm512_maskz_loadu_epi16`] //need i1 * [x] [`_mm_loadu_epi16`] - * [_] [`_mm_mask_loadu_epi16`] - * [_] [`_mm_maskz_loadu_epi16`] + * [_] [`_mm_mask_loadu_epi16`] //need i1 + * [_] [`_mm_maskz_loadu_epi16`] //need i1 * [x] [`_mm256_loadu_epi16`] - * [_] [`_mm256_mask_loadu_epi16`] - * [_] [`_mm256_maskz_loadu_epi16`] + * [_] [`_mm256_mask_loadu_epi16`] //need i1 + * [_] [`_mm256_maskz_loadu_epi16`] //need i1 * [x] [`_mm512_loadu_epi8`] - * [_] [`_mm512_mask_loadu_epi8`] - * [_] [`_mm512_maskz_loadu_epi8`] + * [_] [`_mm512_mask_loadu_epi8`] //need i1 + * [_] [`_mm512_maskz_loadu_epi8`] //need i1 * [x] [`_mm_loadu_epi8`] - * [_] [`_mm_mask_loadu_epi8`] - * [_] [`_mm_maskz_loadu_epi8`] + * [_] [`_mm_mask_loadu_epi8`] //need i1 + * [_] [`_mm_maskz_loadu_epi8`] //need i1 * [x] [`_mm256_loadu_epi8`] - * [_] [`_mm256_mask_loadu_epi8`] - * [_] [`_mm256_maskz_loadu_epi8`] + * [_] [`_mm256_mask_loadu_epi8`] //need i1 + * [_] [`_mm256_maskz_loadu_epi8`] //need i1 * [_] [`_mm512_mask_storeu_epi16`] * [x] [`_mm512_storeu_epi16`] - * [_] [`_mm_mask_storeu_epi16`] + * [_] [`_mm_mask_storeu_epi16`] //need i1 * [x] [`_mm_storeu_epi16`] - * [_] [`_mm256_mask_storeu_epi16`] + * [_] [`_mm256_mask_storeu_epi16`] //need i1 * [x] [`_mm256_storeu_epi16`] - * [_] [`_mm512_mask_storeu_epi8`] + * [_] [`_mm512_mask_storeu_epi8`] //need i1 * [x] [`_mm512_storeu_epi8`] - * [_] [`_mm_mask_storeu_epi8`] + * [_] [`_mm_mask_storeu_epi8`] //need i1 * [x] [`_mm_storeu_epi8`] - * [_] [`_mm256_mask_storeu_epi8`] + * [_] [`_mm256_mask_storeu_epi8`] //need i1 * [x] [`_mm256_storeu_epi8`] * [x] [`_mm512_abs_epi16`] * [x] [`_mm512_mask_abs_epi16`] diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index b45be34078..1ad80147cf 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -2389,13 +2389,13 @@ * [x] [`_mm_cvtt_roundss_u32`] * [x] [`_mm_cvtt_roundss_u64`] * [x] [`_mm_cvttsd_i32`] - * [x] [`_mm_cvttsd_i64`] - + * [x] [`_mm_cvttsd_i64`] * [x] [`_mm_cvttsd_u32`] - * [x] [`_mm_cvttsd_u64`] - + * [x] [`_mm_cvttsd_u64`] * [x] [`_mm_cvttss_i32`] - * [x] [`_mm_cvttss_i64`] - + * [x] [`_mm_cvttss_i64`] * [x] [`_mm_cvttss_u32`] - * [x] [`_mm_cvttss_u64`] - + * [x] [`_mm_cvttss_u64`] * [x] [`_mm_cvtu32_sd`] * [x] [`_mm_cvtu32_ss`] * [x] [`_mm_cvtu64_sd`] @@ -2625,7 +2625,7 @@ * [x] [`_mm512_knot`] * [x] [`_mm512_kor`] * [x] [`_mm512_kortestc`] - * [ ] [`_mm512_kortestz`] + * [ ] [`_mm512_kortestz`] //not sure * [x] [`_mm512_kunpackb`] * [x] [`_mm512_kxnor`] * [x] [`_mm512_kxor`] diff --git a/crates/core_arch/avx512vbmi2.md b/crates/core_arch/avx512vbmi2.md index 4bb6a0ed0c..693af9d930 100644 --- a/crates/core_arch/avx512vbmi2.md +++ b/crates/core_arch/avx512vbmi2.md @@ -12,12 +12,12 @@ * [x] [`_mm256_maskz_compress_epi8`] * [x] [`_mm512_mask_compress_epi8`] * [x] [`_mm512_maskz_compress_epi8`] - * [_] [`_mm_mask_compressstoreu_epi16`] - * [_] [`_mm256_mask_compressstoreu_epi16`] - * [_] [`_mm512_mask_compressstoreu_epi16`] - * [_] [`_mm_mask_compressstoreu_epi8`] - * [_] [`_mm256_mask_compressstoreu_epi8`] - * [_] [`_mm512_mask_compressstoreu_epi8`] + * [_] [`_mm_mask_compressstoreu_epi16`] //need i1 + * [_] [`_mm256_mask_compressstoreu_epi16`] //need i1 + * [_] [`_mm512_mask_compressstoreu_epi16`] //need i1 + * [_] [`_mm_mask_compressstoreu_epi8`] //need i1 + * [_] [`_mm256_mask_compressstoreu_epi8`] //need i1 + * [_] 
[`_mm512_mask_compressstoreu_epi8`] //need i1 * [x] [`_mm_mask_expand_epi16`] * [x] [`_mm_maskz_expand_epi16`] * [x] [`_mm256_mask_expand_epi16`] @@ -30,18 +30,18 @@ * [x] [`_mm256_maskz_expand_epi8`] * [x] [`_mm512_mask_expand_epi8`] * [x] [`_mm512_maskz_expand_epi8`] - * [_] [`_mm_mask_expandloadu_epi16`] - * [_] [`_mm_maskz_expandloadu_epi16`] - * [_] [`_mm256_mask_expandloadu_epi16`] - * [_] [`_mm256_maskz_expandloadu_epi16`] - * [_] [`_mm512_mask_expandloadu_epi16`] - * [_] [`_mm512_maskz_expandloadu_epi16`] - * [_] [`_mm_mask_expandloadu_epi8`] - * [_] [`_mm_maskz_expandloadu_epi8`] - * [_] [`_mm256_mask_expandloadu_epi8`] - * [_] [`_mm256_maskz_expandloadu_epi8`] - * [_] [`_mm512_mask_expandloadu_epi8`] - * [_] [`_mm512_maskz_expandloadu_epi8`] + * [_] [`_mm_mask_expandloadu_epi16`] //need i1 + * [_] [`_mm_maskz_expandloadu_epi16`] //need i1 + * [_] [`_mm256_mask_expandloadu_epi16`] //need i1 + * [_] [`_mm256_maskz_expandloadu_epi16`] //need i1 + * [_] [`_mm512_mask_expandloadu_epi16`] //need i1 + * [_] [`_mm512_maskz_expandloadu_epi16`] //need i1 + * [_] [`_mm_mask_expandloadu_epi8`] //need i1 + * [_] [`_mm_maskz_expandloadu_epi8`] //need i1 + * [_] [`_mm256_mask_expandloadu_epi8`] //need i1 + * [_] [`_mm256_maskz_expandloadu_epi8`] //need i1 + * [_] [`_mm512_mask_expandloadu_epi8`] //need i1 + * [_] [`_mm512_maskz_expandloadu_epi8`] //need i1 * [x] [`_mm_mask_shldi_epi16`] * [x] [`_mm_maskz_shldi_epi16`] * [x] [`_mm_shldi_epi16`] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 769e243919..f72f56a355 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1,5 +1,4 @@ use crate::{ - //core_arch::x86_64::{_mm_cvtsd_si64, _mm_cvtss_si64}, core_arch::{simd::*, simd_llvm::*, x86::*}, mem::{self, transmute}, ptr, @@ -37878,54 +37877,6 @@ pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 { transmute(r) } -/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_i64&expand=1370) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_i64(a: __m128, rounding: i32) -> i64 { - let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_si64&expand=1375) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_si64(a: __m128, rounding: i32) -> i64 { - let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ @@ -37950,30 +37901,6 @@ pub unsafe fn _mm_cvt_roundss_u32(a: __m128, rounding: i32) -> u32 { transmute(r) } -/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_u64&expand=1377) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_u64(a: __m128, rounding: i32) -> u64 { - let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2usi64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtss_i32&expand=1893) @@ -37994,16 +37921,6 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) } -/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_u64&expand=1902) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi))] -pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 { - transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) -} - /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ @@ -38052,54 +37969,6 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 { transmute(r) } -/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_i64&expand=1358) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d, rounding: i32) -> i64 { - let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - -/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_si64&expand=1360) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d, rounding: i32) -> i64 { - let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ @@ -38124,30 +37993,6 @@ pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d, rounding: i32) -> u32 { transmute(r) } -/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_u64&expand=1365) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d, rounding: i32) -> u64 { - let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2usi64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_i32&expand=1791) @@ -38168,16 +38013,6 @@ pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 { transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) } -/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_u64&expand=1800) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi))] -pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { - transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) -} - /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ @@ -38203,102 +38038,6 @@ pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { transmute(r) } -/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_ss&expand=1314) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { - let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtsi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - -/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sd&expand=1313) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d { - let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsi2sd(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - -/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_ss&expand=1368) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { - let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - -/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_sd&expand=1367) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d { - let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsi2sd(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ @@ -38348,54 +38087,6 @@ pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32, rounding: i32) -> __m128 { transmute(r) } -/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_ss&expand=1380) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64, rounding: i32) -> __m128 { - let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtusi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - -/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\ -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\ -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\ -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\ -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\ -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_sd&expand=1379) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtusi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64, rounding: i32) -> __m128d { - let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtusi2sd(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); - transmute(r) -} - /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_ss&expand=1643) @@ -38420,30 +38111,6 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { transmute(r) } -/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_ss&expand=1643) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss))] -pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { - let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) -} - -/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti64_sd&expand=1644) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2sd))] -pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { - let b = b as f64; - let r = simd_insert(a, 0, b); - transmute(r) -} - /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// @@ -38482,44 +38149,6 @@ pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 { transmute(r) } -/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_i64&expand=1935) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_i64(a: __m128, sae: i32) -> i64 { - let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); - transmute(r) -} - -/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_si64&expand=1937) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_si64(a: __m128, sae: i32) -> i64 { - let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); - transmute(r) -} - /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// @@ -38539,25 +38168,6 @@ pub unsafe fn _mm_cvtt_roundss_u32(a: __m128, sae: i32) -> u32 { transmute(r) } -/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_u64&expand=1939) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_u64(a: __m128, sae: i32) -> u64 { - let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2usi64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); - transmute(r) -} - /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_i32&expand=2022) @@ -38568,16 +38178,6 @@ pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 { transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) } -/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=#text=_mm_cvttss_i64&expand=2023) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si))] -pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 { - transmute(vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) -} - /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_u32&expand=2026) @@ -38588,16 +38188,6 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) } -/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_u64&expand=2027) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi))] -pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { - transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION)) -} - /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// @@ -38636,44 +38226,6 @@ pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 { transmute(r) } -/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si64&expand=1931) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d, sae: i32) -> i64 { - let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); - transmute(r) -} - -/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i64&expand=1929) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d, sae: i32) -> i64 { - let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); - transmute(r) -} - /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// @@ -38693,25 +38245,6 @@ pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d, sae: i32) -> u32 { transmute(r) } -/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_u64&expand=1933) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d, sae: i32) -> u64 { - let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2usi64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); - transmute(r) -} - /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_i32&expand=2015) @@ -38722,16 +38255,6 @@ pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 { transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) } -/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_i64&expand=2016) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si))] -pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 { - transmute(vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) -} - /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_u32&expand=2020) @@ -38742,16 +38265,6 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) } -/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_u64&expand=2021) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi))] -pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 { - transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION)) -} - /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_ss&expand=2032) @@ -38776,30 +38289,6 @@ pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { transmute(r) } -/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_ss&expand=2035) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(mov))] // should be vcvtusi2ss -pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { - let b = b as f32; - let r = simd_insert(a, 0, b); - transmute(r) -} - -/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sd&expand=2034)
-#[inline]
-#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(mov))] // should be vcvtusi2sd
-pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
-    let b = b as f64;
-    let r = simd_insert(a, 0, b);
-    transmute(r)
-}
-
 /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
 ///
@@ -39990,34 +39479,21 @@ extern "C" {
     #[link_name = "llvm.x86.avx512.vcvtss2si32"]
     fn vcvtss2si(a: f32x4, rounding: i32) -> i32;
-    #[link_name = "llvm.x86.avx512.vcvtss2si64"]
-    fn vcvtss2si64(a: f32x4, rounding: i32) -> i64;

     #[link_name = "llvm.x86.avx512.vcvtss2usi32"]
     fn vcvtss2usi(a: f32x4, rounding: i32) -> u32;
-    #[link_name = "llvm.x86.avx512.vcvtss2usi64"]
-    fn vcvtss2usi64(a: f32x4, rounding: i32) -> u64;

     #[link_name = "llvm.x86.avx512.vcvtsd2si32"]
     fn vcvtsd2si(a: f64x2, rounding: i32) -> i32;
-    #[link_name = "llvm.x86.avx512.vcvtsd2si64"]
-    fn vcvtsd2si64(a: f64x2, rounding: i32) -> i64;
-
     #[link_name = "llvm.x86.avx512.vcvtsd2usi32"]
     fn vcvtsd2usi(a: f64x2, rounding: i32) -> u32;
-    #[link_name = "llvm.x86.avx512.vcvtsd2usi64"]
-    fn vcvtsd2usi64(a: f64x2, rounding: i32) -> u64;

     #[link_name = "llvm.x86.avx512.cvtsi2ss32"]
     fn vcvtsi2ss(a: f32x4, b: i32, rounding: i32) -> f32x4;
-    #[link_name = "llvm.x86.avx512.cvtsi2ss64"]
-    fn vcvtsi2ss64(a: f32x4, b: i64, rounding: i32) -> f32x4;

     #[link_name = "llvm.x86.avx512.cvtsi2sd64"]
     fn vcvtsi2sd(a: f64x2, b: i64, rounding: i32) -> f64x2;

     #[link_name = "llvm.x86.avx512.cvtusi2ss"]
     fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4;
-    #[link_name = "llvm.x86.avx512.cvtusi642ss"]
-    fn vcvtusi2ss64(a: f32x4, b: u64, rounding: i32) -> f32x4;

     #[link_name = "llvm.x86.avx512.cvtusi642sd"]
     fn vcvtusi2sd(a: f64x2, b: u64, rounding: i32) -> f64x2;
@@ -55669,22 +55145,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundss_i64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvt_roundss_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e: i64 = -1;
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundss_si64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvt_roundss_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e: i64 = -1;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvt_roundss_u32() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55693,14 +55153,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundss_u64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvt_roundss_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e: u64 = u64::MAX;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvtss_i32() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55717,14 +55169,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtss_u64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvtss_u64(a);
-        let e: u64 = u64::MAX;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvt_roundsd_si32() {
         let a = _mm_set_pd(1., -1.5);
@@ -55741,22 +55185,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundsd_i64() {
-        let a = _mm_set_pd(1., -1.5);
-        let r = _mm_cvt_roundsd_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e: i64 = -1;
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundsd_si64() {
-        let a = _mm_set_pd(1., -1.5);
-        let r = _mm_cvt_roundsd_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e: i64 = -1;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvt_roundsd_u32() {
         let a = _mm_set_pd(1., -1.5);
@@ -55765,14 +55193,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundsd_u64() {
-        let a = _mm_set_pd(1., f64::MAX);
-        let r = _mm_cvt_roundsd_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e: u64 = u64::MAX;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvtsd_i32() {
         let a = _mm_set_pd(1., -1.5);
@@ -55789,14 +55209,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtsd_u64() {
-        let a = _mm_set_pd(1., -1.5);
-        let r = _mm_cvtsd_u64(a);
-        let e: u64 = u64::MAX;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvt_roundi32_ss() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55806,42 +55218,6 @@ mod tests {
         assert_eq_m128(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundi64_ss() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let b: i64 = 9;
-        let r = _mm_cvt_roundi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e = _mm_set_ps(0., -0.5, 1., 9.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundi64_sd() {
-        let a = _mm_set_pd(1., -1.5);
-        let b: i64 = 9;
-        let r = _mm_cvt_roundi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e = _mm_set_pd(1., 9.);
-        assert_eq_m128d(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundsi64_ss() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let b: i64 = 9;
-        let r = _mm_cvt_roundsi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e = _mm_set_ps(0., -0.5, 1., 9.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundsi64_sd() {
-        let a = _mm_set_pd(1., -1.5);
-        let b: i64 = 9;
-        let r = _mm_cvt_roundsi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e = _mm_set_pd(1., 9.);
-        assert_eq_m128d(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvt_roundsi32_ss() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55860,24 +55236,6 @@ mod tests {
         assert_eq_m128(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundu64_ss() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let b: u64 = 9;
-        let r = _mm_cvt_roundu64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e = _mm_set_ps(0., -0.5, 1., 9.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvt_roundu64_sd() {
-        let a = _mm_set_pd(1., -1.5);
-        let b: u64 = 9;
-        let r = _mm_cvt_roundu64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        let e = _mm_set_pd(1., 9.);
-        assert_eq_m128d(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvti32_ss() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55896,24 +55254,6 @@ mod tests {
         assert_eq_m128d(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvti64_ss() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let b: i64 = 9;
-        let r = _mm_cvti64_ss(a, b);
-        let e = _mm_set_ps(0., -0.5, 1., 9.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvti64_sd() {
-        let a = _mm_set_pd(1., -1.5);
-        let b: i64 = 9;
-        let r = _mm_cvti64_sd(a, b);
-        let e = _mm_set_pd(1., 9.);
-        assert_eq_m128d(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvtt_roundss_si32() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55930,22 +55270,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtt_roundss_i64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvtt_roundss_i64(a, _MM_FROUND_CUR_DIRECTION);
-        let e: i64 = -2;
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtt_roundss_si64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvtt_roundss_si64(a, _MM_FROUND_CUR_DIRECTION);
-        let e: i64 = -2;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvtt_roundss_u32() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55954,14 +55278,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtt_roundss_u64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvtt_roundss_u64(a, _MM_FROUND_CUR_DIRECTION);
-        let e: u64 = u64::MAX;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvttss_i32() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55970,14 +55286,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvttss_i64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvttss_i64(a);
-        let e: i64 = -2;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvttss_u32() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -55986,14 +55294,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvttss_u64() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let r = _mm_cvttss_u64(a);
-        let e: u64 = u64::MAX;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvtt_roundsd_si32() {
         let a = _mm_set_pd(1., -1.5);
@@ -56010,22 +55310,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtt_roundsd_i64() {
-        let a = _mm_set_pd(1., -1.5);
-        let r = _mm_cvtt_roundsd_i64(a, _MM_FROUND_CUR_DIRECTION);
-        let e: i64 = -2;
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtt_roundsd_si64() {
-        let a = _mm_set_pd(1., -1.5);
-        let r = _mm_cvtt_roundsd_si64(a, _MM_FROUND_CUR_DIRECTION);
-        let e: i64 = -2;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvtt_roundsd_u32() {
         let a = _mm_set_pd(1., -1.5);
@@ -56034,14 +55318,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtt_roundsd_u64() {
-        let a = _mm_set_pd(1., -1.5);
-        let r = _mm_cvtt_roundsd_u64(a, _MM_FROUND_CUR_DIRECTION);
-        let e: u64 = u64::MAX;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvttsd_i32() {
         let a = _mm_set_pd(1., -1.5);
@@ -56050,14 +55326,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvttsd_i64() {
-        let a = _mm_set_pd(1., -1.5);
-        let r = _mm_cvttsd_i64(a);
-        let e: i64 = -2;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvttsd_u32() {
         let a = _mm_set_pd(1., -1.5);
@@ -56066,14 +55334,6 @@ mod tests {
         assert_eq!(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvttsd_u64() {
-        let a = _mm_set_pd(1., -1.5);
-        let r = _mm_cvttsd_u64(a);
-        let e: u64 = u64::MAX;
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_cvtu32_ss() {
         let a = _mm_set_ps(0., -0.5, 1., -1.5);
@@ -56092,24 +55352,6 @@ mod tests {
         assert_eq_m128d(r, e);
     }

-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtu64_ss() {
-        let a = _mm_set_ps(0., -0.5, 1., -1.5);
-        let b: u64 = 9;
-        let r = _mm_cvtu64_ss(a, b);
-        let e = _mm_set_ps(0., -0.5, 1., 9.);
-        assert_eq_m128(r, e);
-    }
-
-    #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm_cvtu64_sd() {
-        let a = _mm_set_pd(1., -1.5);
-        let b: u64 = 9;
-        let r = _mm_cvtu64_sd(a, b);
-        let e = _mm_set_pd(1., 9.);
-        assert_eq_m128d(r, e);
-    }
-
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm_comi_round_ss() {
         let a = _mm_set1_ps(2.2);
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 46f06bd083..cf1b4b6220 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -1,5 +1,7 @@
-use crate::core_arch::x86::*;
-use crate::core_arch::x86_64::*;
+use crate::{
+    core_arch::{simd::*, simd_llvm::*, x86::*, x86_64::*},
+    mem::transmute,
+};

 #[cfg(test)]
 use stdarch_test::assert_instr;
@@ -24,6 +26,537 @@ pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 {
     _mm_cvtss_si64(a)
 }

+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_u64&expand=1902)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi))]
+pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 {
+    transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_u64&expand=1800)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi))]
+pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 {
+    transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti64_ss&expand=1643)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss))]
+pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 {
+    let b = b as f32;
+    let r = simd_insert(a, 0, b);
+    transmute(r)
+}
+
+/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti64_sd&expand=1644)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2sd))]
+pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d {
+    let b = b as f64;
+    let r = simd_insert(a, 0, b);
+    transmute(r)
+}
+
+/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_ss&expand=2035)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(mov))] // should be vcvtusi2ss
+pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 {
+    let b = b as f32;
+    let r = simd_insert(a, 0, b);
+    transmute(r)
+}
+
+/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sd&expand=2034)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(mov))] // should be vcvtusi2sd
+pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
+    let b = b as f64;
+    let r = simd_insert(a, 0, b);
+    transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_i64&expand=2016)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si))]
+pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 {
+    transmute(vcvtsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_u64&expand=2021)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi))]
+pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 {
+    transmute(vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_i64&expand=2023)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si))]
+pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 {
+    transmute(vcvtss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_u64&expand=2027)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi))]
+pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 {
+    transmute(vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
+}
+
+/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sd&expand=1313)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsi2sd64(a, b, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_sd&expand=1367)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsi2sd64(a, b, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_ss&expand=1314)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsi2ss64(a, b, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_sd&expand=1379)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2sd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtusi2sd64(a, b, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_ss&expand=1368)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsi2ss64(a, b, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_ss&expand=1380)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtusi2ss64(a, b, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_si64&expand=1360)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d, rounding: i32) -> i64 {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsd2si64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_i64&expand=1358)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d, rounding: i32) -> i64 {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsd2si64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_u64&expand=1365)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d, rounding: i32) -> u64 {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsd2usi64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_si64&expand=1375)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvt_roundss_si64(a: __m128, rounding: i32) -> i64 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtss2si64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_i64&expand=1370)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvt_roundss_i64(a: __m128, rounding: i32) -> i64 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtss2si64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_u64&expand=1377)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvt_roundss_u64(a: __m128, rounding: i32) -> u64 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtss2usi64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_round!(rounding, call);
+    transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si64&expand=1931)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d, sae: i32) -> i64 {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsd2si64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_sae!(sae, call);
+    transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i64&expand=1929)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d, sae: i32) -> i64 {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsd2si64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_sae!(sae, call);
+    transmute(r)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_u64&expand=1933)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d, sae: i32) -> u64 {
+    let a = a.as_f64x2();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtsd2usi64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_sae!(sae, call);
+    transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_i64&expand=1935)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvtt_roundss_i64(a: __m128, sae: i32) -> i64 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtss2si64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_sae!(sae, call);
+    transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_si64&expand=1937)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvtt_roundss_si64(a: __m128, sae: i32) -> i64 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtss2si64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_sae!(sae, call);
+    transmute(r)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_u64&expand=1939)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm_cvtt_roundss_u64(a: __m128, sae: i32) -> u64 {
+    let a = a.as_f32x4();
+    macro_rules! call {
+        ($imm4:expr) => {
+            vcvtss2usi64(a, $imm4)
+        };
+    }
+    let r = constify_imm4_sae!(sae, call);
+    transmute(r)
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.avx512.vcvtss2si64"]
+    fn vcvtss2si64(a: f32x4, rounding: i32) -> i64;
+    #[link_name = "llvm.x86.avx512.vcvtss2usi64"]
+    fn vcvtss2usi64(a: f32x4, rounding: i32) -> u64;
+    #[link_name = "llvm.x86.avx512.vcvtsd2si64"]
+    fn vcvtsd2si64(a: f64x2, rounding: i32) -> i64;
+    #[link_name = "llvm.x86.avx512.vcvtsd2usi64"]
+    fn vcvtsd2usi64(a: f64x2, rounding: i32) -> u64;
+
+    #[link_name = "llvm.x86.avx512.cvtsi2ss64"]
+    fn vcvtsi2ss64(a: f32x4, b: i64, rounding: i32) -> f32x4;
+    #[link_name = "llvm.x86.avx512.cvtsi2sd64"]
+    fn vcvtsi2sd64(a: f64x2, b: i64, rounding: i32) -> f64x2;
+    #[link_name = "llvm.x86.avx512.cvtusi642ss"]
+    fn vcvtusi2ss64(a: f32x4, b: u64, rounding: i32) -> f32x4;
+    #[link_name = "llvm.x86.avx512.cvtusi642sd"]
+    fn vcvtusi2sd64(a: f64x2, b: u64, rounding: i32) -> f64x2;
+}
+
 #[cfg(test)]
 mod tests {
@@ -11821,4 +12354,238 @@ mod tests {
         let e: i64 = -2;
         assert_eq!(r, e);
     }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundi64_ss() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let b: i64 = 9;
+        let r = _mm_cvt_roundi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e = _mm_set_ps(0., -0.5, 1., 9.);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundsi64_ss() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let b: i64 = 9;
+        let r = _mm_cvt_roundsi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e = _mm_set_ps(0., -0.5, 1., 9.);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvti64_ss() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let b: i64 = 9;
+        let r = _mm_cvti64_ss(a, b);
+        let e = _mm_set_ps(0., -0.5, 1., 9.);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvti64_sd() {
+        let a = _mm_set_pd(1., -1.5);
+        let b: i64 = 9;
+        let r = _mm_cvti64_sd(a, b);
+        let e = _mm_set_pd(1., 9.);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundsd_si64() {
+        let a = _mm_set_pd(1., -1.5);
+        let r = _mm_cvt_roundsd_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e: i64 = -1;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundsd_i64() {
+        let a = _mm_set_pd(1., -1.5);
+        let r = _mm_cvt_roundsd_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e: i64 = -1;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundsd_u64() {
+        let a = _mm_set_pd(1., f64::MAX);
+        let r = _mm_cvt_roundsd_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e: u64 = u64::MAX;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtsd_u64() {
+        let a = _mm_set_pd(1., -1.5);
+        let r = _mm_cvtsd_u64(a);
+        let e: u64 = u64::MAX;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundss_i64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvt_roundss_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e: i64 = -1;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundss_si64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvt_roundss_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e: i64 = -1;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundss_u64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvt_roundss_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e: u64 = u64::MAX;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtss_u64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvtss_u64(a);
+        let e: u64 = u64::MAX;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvttsd_i64() {
+        let a = _mm_set_pd(1., -1.5);
+        let r = _mm_cvttsd_i64(a);
+        let e: i64 = -2;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtt_roundsd_i64() {
+        let a = _mm_set_pd(1., -1.5);
+        let r = _mm_cvtt_roundsd_i64(a, _MM_FROUND_CUR_DIRECTION);
+        let e: i64 = -2;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtt_roundsd_si64() {
+        let a = _mm_set_pd(1., -1.5);
+        let r = _mm_cvtt_roundsd_si64(a, _MM_FROUND_CUR_DIRECTION);
+        let e: i64 = -2;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtt_roundsd_u64() {
+        let a = _mm_set_pd(1., -1.5);
+        let r = _mm_cvtt_roundsd_u64(a, _MM_FROUND_CUR_DIRECTION);
+        let e: u64 = u64::MAX;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvttsd_u64() {
+        let a = _mm_set_pd(1., -1.5);
+        let r = _mm_cvttsd_u64(a);
+        let e: u64 = u64::MAX;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvttss_i64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvttss_i64(a);
+        let e: i64 = -2;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtt_roundss_i64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvtt_roundss_i64(a, _MM_FROUND_CUR_DIRECTION);
+        let e: i64 = -2;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtt_roundss_si64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvtt_roundss_si64(a, _MM_FROUND_CUR_DIRECTION);
+        let e: i64 = -2;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtt_roundss_u64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvtt_roundss_u64(a, _MM_FROUND_CUR_DIRECTION);
+        let e: u64 = u64::MAX;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvttss_u64() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let r = _mm_cvttss_u64(a);
+        let e: u64 = u64::MAX;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtu64_ss() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let b: u64 = 9;
+        let r = _mm_cvtu64_ss(a, b);
+        let e = _mm_set_ps(0., -0.5, 1., 9.);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvtu64_sd() {
+        let a = _mm_set_pd(1., -1.5);
+        let b: u64 = 9;
+        let r = _mm_cvtu64_sd(a, b);
+        let e = _mm_set_pd(1., 9.);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundu64_ss() {
+        let a = _mm_set_ps(0., -0.5, 1., -1.5);
+        let b: u64 = 9;
+        let r = _mm_cvt_roundu64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e = _mm_set_ps(0., -0.5, 1., 9.);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundu64_sd() {
+        let a = _mm_set_pd(1., -1.5);
+        let b: u64 = 9;
+        let r = _mm_cvt_roundu64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e = _mm_set_pd(1., 9.);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundi64_sd() {
+        let a = _mm_set_pd(1., -1.5);
+        let b: i64 = 9;
+        let r = _mm_cvt_roundi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e = _mm_set_pd(1., 9.);
+        assert_eq_m128d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm_cvt_roundsi64_sd() {
+        let a = _mm_set_pd(1., -1.5);
+        let b: i64 = 9;
+        let r = _mm_cvt_roundsi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        let e = _mm_set_pd(1., 9.);
+        assert_eq_m128d(r, e);
+    }
 }
diff --git a/crates/core_arch/src/x86_64/macros.rs b/crates/core_arch/src/x86_64/macros.rs
new file mode 100644
index 0000000000..e3682d40fe
--- /dev/null
+++ b/crates/core_arch/src/x86_64/macros.rs
@@ -0,0 +1,32 @@
+//! Utility macros.
+
+// For round instructions, the only valid values for rounding are 4, 8, 9, 10 and 11.
+// This macro enforces that.
+#[allow(unused)]
+macro_rules! constify_imm4_round {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match ($imm8) & 0b1111 {
+            4 => $expand!(4),
+            8 => $expand!(8),
+            9 => $expand!(9),
+            10 => $expand!(10),
+            11 => $expand!(11),
+            _ => panic!("Invalid round value"),
+        }
+    };
+}
+
+// For sae instructions, the only valid values for sae are 4 and 8.
+// This macro enforces that.
+#[allow(unused)]
+macro_rules! constify_imm4_sae {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match ($imm8) & 0b1111 {
+            4 => $expand!(4),
+            8 => $expand!(8),
+            _ => panic!("Invalid sae value"),
+        }
+    };
+}
diff --git a/crates/core_arch/src/x86_64/mod.rs b/crates/core_arch/src/x86_64/mod.rs
index c9f3bd637c..461874ece0 100644
--- a/crates/core_arch/src/x86_64/mod.rs
+++ b/crates/core_arch/src/x86_64/mod.rs
@@ -1,5 +1,8 @@
 //! `x86_64` intrinsics

+#[macro_use]
+mod macros;
+
 mod fxsr;
 pub use self::fxsr::*;
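
A minimal usage sketch of the relocated 64-bit scalar conversions (not part of the patch): it mirrors the values asserted by the new tests above and assumes it is compiled inside this crate, on an x86_64 target with AVX-512F available; the function name is illustrative only.

    // Illustrative only: behaviour matches the new tests in this patch.
    #[target_feature(enable = "avx512f")]
    unsafe fn cvt_demo() {
        let a = _mm_set_pd(1., -1.5);
        // Round the lower lane toward zero, suppressing exceptions: -1.5 -> -1.
        assert_eq!(
            _mm_cvt_roundsd_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC),
            -1
        );
        // Convert an unsigned 64-bit integer into the low lane; the upper lane keeps 1.0.
        let r = _mm_cvtu64_sd(a, 9);
        assert_eq!(_mm_cvtsd_f64(r), 9.0);
    }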