Add _mm_mpsadbw_epu8

p32blo · p32blo · commit 920da4f171c2 · 2017-11-07T12:14:43.000Z
diff --git a/src/x86/macros.rs b/src/x86/macros.rs
@@ -328,6 +328,22 @@ macro_rules! constify_imm4 {
     }
 }
 
+macro_rules! constify_imm3 { // dispatches `$imm8 & 0b111` to `$expand!(<literal 0..=7>)`
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match $imm8 & 0b111 { // only the low three bits select the arm
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            _ => $expand!(7), // after masking, 7 is the only value left
+        }
+    }
+}
+
 macro_rules! constify_imm2 {
     ($imm8:expr, $expand:ident) => {
         #[allow(overflowing_literals)]
diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
@@ -151,8 +151,7 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
 /// Then zero elements according to `imm8`.
 ///
 /// `imm8` specifies which bits from operand `a` will be copied, which bits in
-/// the
-/// result they will be copied to, and which bits in the result will be
+/// the result they will be copied to, and which bits in the result will be
 /// cleared. The following assignments are made:
 ///
 /// * Bits `[7:6]` specify the bits to copy from operand `a`:
@@ -375,14 +374,14 @@ pub unsafe fn _mm_mullo_epi32 (a: i32x4, b:i32x4) -> i32x4 {
 
 /// Tests whether the specified bits in a 128-bit integer vector are all
 /// zeros.
-/// 
+///
 /// Arguments:
-/// 
+///
 /// * `a` - A 128-bit integer vector containing the bits to be tested.
 /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
-/// 
+///
 /// Returns:
-/// 
+///
 /// * `1` - if the specified bits are all zeros,
 /// * `0` - otherwise.
 #[inline(always)]
@@ -397,12 +396,12 @@ pub unsafe fn _mm_testz_si128(a: i64x2, mask: i64x2) -> i32 {
 /// ones.
 ///
 /// Arguments:
-/// 
+///
 /// * `a` - A 128-bit integer vector containing the bits to be tested.
 /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
-/// 
+///
 /// Returns:
-/// 
+///
 /// * `1` - if the specified bits are all ones,
 /// * `0` - otherwise.
 #[inline(always)]
@@ -416,12 +415,12 @@ pub unsafe fn _mm_testc_si128(a: i64x2, mask: i64x2) -> i32 {
 /// neither all zeros nor all ones.
 ///
 /// Arguments:
-/// 
+///
 /// * `a` - A 128-bit integer vector containing the bits to be tested.
 /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
-/// 
+///
 /// Returns:
-/// 
+///
 /// * `1` - if the specified bits are neither all zeros nor all ones,
 /// * `0` - otherwise.
 #[inline(always)]
@@ -433,14 +432,14 @@ pub unsafe fn _mm_testnzc_si128(a: i64x2, mask: i64x2) -> i32 {
 
 /// Tests whether the specified bits in a 128-bit integer vector are all
 /// zeros.
-/// 
+///
 /// Arguments:
-/// 
+///
 /// * `a` - A 128-bit integer vector containing the bits to be tested.
 /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
-/// 
+///
 /// Returns:
-/// 
+///
 /// * `1` - if the specified bits are all zeros,
 /// * `0` - otherwise.
 #[inline(always)]
@@ -452,13 +451,13 @@ pub unsafe fn _mm_test_all_zeros(a: i64x2, mask: i64x2) -> i32 {
 
 /// Tests whether the specified bits in `a` 128-bit integer vector are all
 /// ones.
-/// 
+///
 /// Argument:
-/// 
+///
 /// * `a` - A 128-bit integer vector containing the bits to be tested.
-/// 
+///
 /// Returns:
-///    
+///
 /// * `1` - if the bits specified in the operand are all set to 1,
 /// * `0` - otherwise.
 #[inline(always)]
@@ -473,12 +472,12 @@ pub unsafe fn _mm_test_all_ones(a: i64x2) -> i32 {
 /// neither all zeros nor all ones.
 ///
 /// Arguments:
-/// 
+///
 /// * `a` - A 128-bit integer vector containing the bits to be tested.
 /// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
-/// 
+///
 /// Returns:
-/// 
+///
 /// * `1` - if the specified bits are neither all zeros nor all ones,
 /// * `0` - otherwise.
 #[inline(always)]
@@ -731,22 +730,17 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 {
 }
 
 /// Finds the minimum unsigned 16-bit element in the input 128-bit
-/// vector of [8 x u16] and returns it and along with its index.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
-/// instruction.
+/// vector of `u16x8` and returns it along with its index.
 ///
 /// Arguments:
-/// 
+///
 /// * `a` - A 128-bit vector of type `u16x8`.
-/// 
+///
 /// Returns:
-/// 
+///
 /// A 128-bit value where:
-/// 
-/// * bits `[15:0]` - contain the minimum value found in parameter `a`, 
+///
+/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
 /// * bits `[18:16]` - contain the index of the minimum value
 /// * remaining bits are set to `0`.
 #[inline(always)]
@@ -756,6 +750,47 @@ pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 {
     phminposuw(a)
 }
 
+/// Computes sums of absolute differences (SAD) between groups of four
+/// unsigned 8-bit integers taken from `a` and `b`. Eight 16-bit sums are
+/// produced, one per overlapping 4-byte window of `a`; the starting offsets
+/// within `a` and `b` are selected by the bit fields of `imm8`.
+///
+/// The following algorithm is performed:
+///
+/// ```ignore
+/// i = imm8[2] * 4
+/// j = imm8[1:0] * 4
+/// for k := 0 to 7
+///     d0 = abs(a[i + k + 0] - b[j + 0])
+///     d1 = abs(a[i + k + 1] - b[j + 1])
+///     d2 = abs(a[i + k + 2] - b[j + 2])
+///     d3 = abs(a[i + k + 3] - b[j + 3])
+///     r[k] = d0 + d1 + d2 + d3
+/// ```
+///
+/// Arguments:
+///
+/// * `a` - A 128-bit vector of type `i8x16` (lanes are read as unsigned bytes).
+/// * `b` - A 128-bit vector of type `i8x16` (lanes are read as unsigned bytes).
+/// * `imm8` - An 8-bit immediate operand selecting the byte offsets used in the
+///            algorithm above; only the low three bits are used:
+///     * Bit `[2]` specifies the offset into operand `a` (in units of 4 bytes)
+///     * Bits `[1:0]` specify the offset into operand `b` (in units of 4 bytes)
+///
+/// Returns:
+///
+/// * An `i16x8` vector containing the sums of the sets of
+///   absolute differences between both operands.
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(test, assert_instr(mpsadbw, imm8=0))]
+pub unsafe fn _mm_mpsadbw_epu8(a: i8x16, b: i8x16, imm8: u8) -> i16x8 {
+    macro_rules! call {
+        ($imm8:expr) => { mpsadbw(a, b, $imm8) }
+    }
+    constify_imm3!(imm8, call)
+}
+
 
 #[allow(improper_ctypes)]
 extern "C" {
@@ -805,6 +840,8 @@ extern "C" {
     fn roundss(a: f32x4, b: f32x4, rounding: i32) -> f32x4;
     #[link_name = "llvm.x86.sse41.phminposuw"]
     fn phminposuw(a: u16x8) -> u16x8;
+    #[link_name = "llvm.x86.sse41.mpsadbw"]
+    fn mpsadbw(a: i8x16, b: i8x16, imm8: u8) -> i16x8;
 }
 
 #[cfg(test)]
@@ -1083,7 +1120,7 @@ mod tests {
         let e = i64x2::splat(-10);
         assert_eq!(r, e);
     }
-    
+
     #[simd_test = "sse4.1"]
     unsafe fn _mm_cvtepi32_epi64() {
         let a = i32x4::splat(10);
@@ -1393,4 +1430,29 @@ mod tests {
         let e = u16x8::splat(0).replace(0, 1).replace(1, 5);
         assert_eq!(r, e);
     }
+
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_mpsadbw_epu8() {
+        let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        // Both operands are `a` (a[n] == n), so every sum is r[k] = 4 * |i + k - j|.
+        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b000); // i = 0, j = 0
+        let e = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+        assert_eq!(r, e);
+
+        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b001); // i = 0, j = 4
+        let e = i16x8::new(16, 12, 8, 4, 0, 4, 8, 12);
+        assert_eq!(r, e);
+
+        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b100); // i = 4, j = 0
+        let e = i16x8::new(16, 20, 24, 28, 32, 36, 40, 44);
+        assert_eq!(r, e);
+
+        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b101); // i = 4, j = 4
+        let e = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
+        assert_eq!(r, e);
+
+        let r = sse41::_mm_mpsadbw_epu8(a, a, 0b111); // i = 4, j = 12
+        let e = i16x8::new(32, 28, 24, 20, 16, 12, 8, 4);
+        assert_eq!(r, e);
+    }
 }