From c9a400e488ed250f45fbe67c7d647d661d3021e2 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Fri, 26 May 2017 18:08:55 -0500 Subject: [PATCH 01/24] start adding avx2 --- src/x86/avx2.rs | 164 +++++++++++++++++++++++++++++++++++++++++++++++- src/x86/mod.rs | 2 + 2 files changed, 163 insertions(+), 3 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 0a4588b178..2a29b48e17 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -1,3 +1,161 @@ -use simd::*; -use v128::*; -use v64::*; +use v256::*; + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 { + unsafe { pabsd(a) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 { + unsafe { pabsw(a) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 { + unsafe { pabsb(a) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { + a + b +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { + a + b +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { + a + b +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { + a + b +} + + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx2.pabs.b"] + fn pabsb(a: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.pabs.w"] + fn pabsw(a: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pabs.d"] + fn pabsd(a: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.padds_b"] + fn paddsb(a:i8x32,b:i8x32) -> i8x32; + + +} + + +#[cfg(test)] +mod tests { + use v256::*; + use x86::avx2; + use std; + + #[test] + #[target_feature = "+avx2"] + fn _mm_256_abs_epi32() { + let a = i32x8::new(0, 1, -1, std::i32::MAX, + std::i32::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi32(a); + let e = i32x8::new(0, 1, 1, std::i32::MAX, + (std::i32::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm_256_abs_epi16() { + let a = i16x16::new(0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i16::MAX, + std::i16::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi16(a); + let e = i16x16::new(0, 1, 1, 2, + 2, 3, 3, 4, + 4, 5, 5, std::i16::MAX, + (std::i16::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm_256_abs_epi8() { + let a = i8x32::new(0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32, + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi8(a); + let e = i8x32::new(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn __mm_256_add_eip64() { + let a = i64x4::new(-10, 0, 100, 1_000_000_000); + let b = i64x4::new(-1, 0, 1, 2); + let r = avx2::_mm256_add_epi64(a, b); + let e = i64x4::new(-11, 0, 101, 1_000_000_002); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn __mm_256_add_eip32() { + let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); + let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx2::_mm256_add_epi32(a, b); + let e = i32x8::new(0, 2, 4, 6, 8, 10, 12, 14); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] 
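+    // Note: `_mm256_add_epi16` wraps on overflow (modular arithmetic,
+    // matching the underlying instruction); the sums below stay well
+    // inside the i16 range, so only plain addition is exercised.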
+ fn __mm_256_add_eip16() { + let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let r = avx2::_mm256_add_epi16(a, b); + let e = i16x16::new(0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn __mm_256_add_eip8() { + let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + let b = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + let r = avx2::_mm256_add_epi8(a, b); + let e = i8x32::new(0, 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 52, 54, 56, 58, 60,62); + assert_eq!(r, e); + } + +} diff --git a/src/x86/mod.rs b/src/x86/mod.rs index 610bf657d0..45ba6e158d 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -2,6 +2,7 @@ pub use self::sse::*; pub use self::sse2::*; pub use self::ssse3::*; pub use self::sse42::*; +pub use self::avx2::*; #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; @@ -10,3 +11,4 @@ mod sse; mod sse2; mod ssse3; mod sse42; +mod avx2; \ No newline at end of file From b052dc5a5522dd73902e3d01506b67dbc671a7b5 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Fri, 26 May 2017 19:14:06 -0500 Subject: [PATCH 02/24] test for adds_epi8 --- src/x86/avx2.rs | 104 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 87 insertions(+), 17 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 2a29b48e17..91b4236d47 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -42,6 +42,31 @@ pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { a + b } +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { + unsafe { paddsb(a,b) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { paddsw(a,b) } +} + + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { + unsafe { paddusb(a,b) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epu6(a: u16x16, b: u16x16) -> u16x16 { + unsafe { paddusw(a,b) } +} + #[allow(improper_ctypes)] extern "C" { @@ -51,8 +76,14 @@ extern "C" { fn pabsw(a: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx2.padds_b"] + #[link_name = "llvm.x86.avx2.padds.b"] fn paddsb(a:i8x32,b:i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.padds.w"] + fn paddsw(a:i16x16,b:i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.paddus.b"] + fn paddusb(a:u8x32,b:u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.paddus.w"] + fn paddusw(a:u16x16,b:u16x16) -> u16x16; } @@ -108,7 +139,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn __mm_256_add_eip64() { + fn _mm_256_add_epi64() { let a = i64x4::new(-10, 0, 100, 1_000_000_000); let b = i64x4::new(-1, 0, 1, 2); let r = avx2::_mm256_add_epi64(a, b); @@ -118,7 +149,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn __mm_256_add_eip32() { + fn _mm_256_add_epi32() { let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); let r = avx2::_mm256_add_epi32(a, b); @@ -128,7 +159,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn __mm_256_add_eip16() { + fn _mm_256_add_epi16() { let a = 
i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, @@ -141,21 +172,60 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn __mm_256_add_eip8() { - let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31); - let b = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31); + fn _mm_256_add_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + let b = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); let r = avx2::_mm256_add_epi8(a, b); - let e = i8x32::new(0, 2, 4, 6, 8, 10, 12, 14, 16, - 18, 20, 22, 24, 26, 28, 30, 32, - 34, 36, 38, 40, 42, 44, 46, 48, - 50, 52, 54, 56, 58, 60,62); + let e = i8x32::new( + 0, 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 52, 54, 56, 58, 60,62); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm_256_adds_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + let b = i8x32::new( + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, + 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63); + let r = avx2::_mm256_adds_epi8(a, b); + let e = i8x32::new( + 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62, + 64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94); assert_eq!(r, e); } + #[test] + #[target_feature = "+avx2"] + fn _mm_adds_epi8_saturate_positive() { + let a = i8x32::splat(0x7F); + let b = i8x32::splat(1); + let r = avx2::_mm256_adds_epi8(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm_adds_epi8_saturate_negative() { + let a = i8x32::splat(-0x80); + let b = i8x32::splat(-1); + let r = avx2::_mm256_adds_epi8(a, b); + assert_eq!(r, a); + } + + + } From e280aa4dcd2a8b3469b0070266fcdff4453a0f26 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Fri, 26 May 2017 19:21:19 -0500 Subject: [PATCH 03/24] all adds tests --- src/x86/avx2.rs | 105 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 11 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 91b4236d47..c7caab83a1 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -63,7 +63,7 @@ pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_adds_epu6(a: u16x16, b: u16x16) -> u16x16 { +pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { paddusw(a,b) } } @@ -97,7 +97,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_abs_epi32() { + fn _mm256_abs_epi32() { let a = i32x8::new(0, 1, -1, std::i32::MAX, std::i32::MIN + 1, 100, -100, -32); let r = avx2::_mm256_abs_epi32(a); @@ -108,7 +108,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_abs_epi16() { + fn _mm256_abs_epi16() { let a = i16x16::new(0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i16::MAX, @@ -123,7 +123,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_abs_epi8() { + fn _mm256_abs_epi8() { let a = i8x32::new(0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i8::MAX, @@ -139,7 +139,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_add_epi64() { + fn _mm256_add_epi64() { let a = i64x4::new(-10, 
0, 100, 1_000_000_000); let b = i64x4::new(-1, 0, 1, 2); let r = avx2::_mm256_add_epi64(a, b); @@ -149,7 +149,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_add_epi32() { + fn _mm256_add_epi32() { let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); let r = avx2::_mm256_add_epi32(a, b); @@ -159,7 +159,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_add_epi16() { + fn _mm256_add_epi16() { let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, @@ -172,7 +172,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_add_epi8() { + fn _mm256_add_epi8() { let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -194,7 +194,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_adds_epi8() { + fn _mm256_adds_epi8() { let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); @@ -210,7 +210,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_adds_epi8_saturate_positive() { + fn _mm256_adds_epi8_saturate_positive() { let a = i8x32::splat(0x7F); let b = i8x32::splat(1); let r = avx2::_mm256_adds_epi8(a, b); @@ -219,13 +219,96 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_adds_epi8_saturate_negative() { + fn _mm256_adds_epi8_saturate_negative() { let a = i8x32::splat(-0x80); let b = i8x32::splat(-1); let r = avx2::_mm256_adds_epi8(a, b); assert_eq!(r, a); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16() { + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47); + let r = avx2::_mm256_adds_epi16(a, b); + let e = i16x16::new( + 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62); + + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16_saturate_positive() { + let a = i16x16::splat(0x7FFF); + let b = i16x16::splat(1); + let r = avx2::_mm256_adds_epi16(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16_saturate_negative() { + let a = i16x16::splat(-0x8000); + let b = i16x16::splat(-1); + let r = avx2::_mm256_adds_epi16(a, b); + assert_eq!(r, a); + } + + //-------------- + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu8() { + let a = u8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + let b = u8x32::new( + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, + 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63); + let r = avx2::_mm256_adds_epu8(a, b); + let e = u8x32::new( + 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62, + 64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu8_saturate() { + let a = u8x32::splat(0xFF); + let b = u8x32::splat(1); + let r = avx2::_mm256_adds_epu8(a, b); + assert_eq!(r, a); + } + + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu16() { + let a = u16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = u16x16::new( + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47); + let r = avx2::_mm256_adds_epu16(a, b); + let e = u16x16::new( + 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62); + + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu16_saturate() { + let a = u16x16::splat(0xFFFF); + let b = 
u16x16::splat(1); + let r = avx2::_mm256_adds_epu16(a, b); + assert_eq!(r, a); + } + + } From 0d653b50eae3f7b7b9e8e03e63a7077ebf89a1fe Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sat, 27 May 2017 10:47:18 -0500 Subject: [PATCH 04/24] doc comments --- src/x86/avx2.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index c7caab83a1..372850bdd7 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -1,66 +1,76 @@ use v256::*; +/// Computes the absolute values of packed 32-bit integers in `a`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 { unsafe { pabsd(a) } } +/// Computes the absolute values of packed 16-bit integers in `a`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 { unsafe { pabsw(a) } } +/// Computes the absolute values of packed 8-bit integers in `a`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 { unsafe { pabsb(a) } } +/// Add packed 64-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { a + b } +/// Add packed 32-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { a + b } +/// Add packed 16-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { a + b } +/// Add packed 8-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { a + b } +/// Add packed 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { unsafe { paddsb(a,b) } } +/// Add packed 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { unsafe { paddsw(a,b) } } - +/// Add packed unsigned 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { unsafe { paddusb(a,b) } } +/// Add packed unsigned 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { @@ -257,8 +267,7 @@ mod tests { let r = avx2::_mm256_adds_epi16(a, b); assert_eq!(r, a); } - - //-------------- + #[test] #[target_feature = "+avx2"] fn _mm256_adds_epu8() { From fef7664eb953676d53aeb983910aff8e43949509 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sat, 27 May 2017 11:01:12 -0500 Subject: [PATCH 05/24] and and andnot --- src/x86/avx2.rs | 31 ++++++++++++++++++++++++++++++- src/x86/mod.rs | 2 ++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 372850bdd7..73baf0e330 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -1,4 +1,5 @@ use v256::*; +use x86::__m256i; /// Computes the absolute values of packed 32-bit integers in `a`. #[inline(always)] @@ -77,6 +78,22 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { paddusw(a,b) } } +/// Compute the bitwise AND of 256 bits (representing integer data) +/// in `a` and `b`. 
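+///
+/// Each of the 256 bits is ANDed independently. As a sketch of the
+/// semantics (mirroring this file's tests; `__m256i` is `i8x32` here):
+///
+/// ```ignore
+/// let r = _mm256_and_si256(__m256i::splat(5), __m256i::splat(3));
+/// assert_eq!(r, __m256i::splat(1)); // 0b101 & 0b011 == 0b001 per lane
+/// ```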
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_and_si256(a: __m256i, b:__m256i) -> __m256i { + a & b +} + +/// Compute the bitwise NOT of 256 bits (representing integer data) +/// in `a` and then AND with `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_andnot_si256(a: __m256i, b:__m256i) -> __m256i { + (!a) & b +} + #[allow(improper_ctypes)] extern "C" { @@ -103,6 +120,7 @@ extern "C" { mod tests { use v256::*; use x86::avx2; + use x86::__m256i; use std; #[test] @@ -316,8 +334,19 @@ mod tests { let r = avx2::_mm256_adds_epu16(a, b); assert_eq!(r, a); } - + #[test] + fn _mm_and_si256() { + assert_eq!( + avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3)), + __m256i::splat(1)); + } + #[test] + fn _mm_andnot_si256() { + assert_eq!( + avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)), + __m256i::splat(2)); + } } diff --git a/src/x86/mod.rs b/src/x86/mod.rs index 45ba6e158d..d36fa4444d 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -6,6 +6,8 @@ pub use self::avx2::*; #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; +#[allow(non_camel_case_types)] +pub type __m256i = ::v256::i8x32; mod sse; mod sse2; From ef8252caa5dcc92d40fe5403ea3e6bab4821bf9a Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sat, 27 May 2017 11:12:57 -0500 Subject: [PATCH 06/24] avg --- src/x86/avx2.rs | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 73baf0e330..93da86a6f1 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -94,6 +94,19 @@ pub fn _mm256_andnot_si256(a: __m256i, b:__m256i) -> __m256i { (!a) & b } +/// Average packed unsigned 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_avg_epu16 (a:u16x16,b:u16x16) -> u16x16 { + unsafe { pavgw(a,b) } +} + +/// Average packed unsigned 8-bit integers in `a` and `b`. 
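+///
+/// The per-lane result is the rounded average `(a + b + 1) >> 1`, so
+/// halves round up. A sketch mirroring the test below:
+///
+/// ```ignore
+/// let r = _mm256_avg_epu8(u8x32::splat(3), u8x32::splat(9));
+/// assert_eq!(r, u8x32::splat(6));
+/// ```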
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_avg_epu8 (a:u8x32,b:u8x32) -> u8x32 { + unsafe { pavgb(a,b) } +} #[allow(improper_ctypes)] extern "C" { @@ -111,8 +124,10 @@ extern "C" { fn paddusb(a:u8x32,b:u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.paddus.w"] fn paddusw(a:u16x16,b:u16x16) -> u16x16; - - + #[link_name = "llvm.x86.avx2.pavg.b"] + fn pavgb(a:u8x32,b:u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.pavg.w"] + fn pavgw(a:u16x16,b:u16x16) -> u16x16; } @@ -336,17 +351,35 @@ mod tests { } #[test] - fn _mm_and_si256() { + #[target_feature = "+avx2"] + fn _mm256_and_si256() { assert_eq!( avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3)), __m256i::splat(1)); } #[test] - fn _mm_andnot_si256() { + #[target_feature = "+avx2"] + fn _mm256_andnot_si256() { assert_eq!( avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)), __m256i::splat(2)); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_avg_epu8() { + let (a, b) = (u8x32::splat(3), u8x32::splat(9)); + let r = avx2::_mm256_avg_epu8(a, b); + assert_eq!(r, u8x32::splat(6)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_avg_epu16() { + let (a, b) = (u16x16::splat(3), u16x16::splat(9)); + let r = avx2::_mm256_avg_epu16(a, b); + assert_eq!(r, u16x16::splat(6)); + } + } From 9d393ba3df40035792a178b6cc3712b1eefd7f46 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sat, 27 May 2017 12:23:17 -0500 Subject: [PATCH 07/24] blendv --- src/x86/avx2.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 93da86a6f1..1b4d5c9315 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -108,6 +108,13 @@ pub fn _mm256_avg_epu8 (a:u8x32,b:u8x32) -> u8x32 { unsafe { pavgb(a,b) } } +/// Blend packed 8-bit integers from `a` and `b` using `mask`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { + unsafe { pblendvb(a,b,mask) } +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -128,6 +135,8 @@ extern "C" { fn pavgb(a:u8x32,b:u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.pavg.w"] fn pavgw(a:u16x16,b:u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pblendvb"] + fn pblendvb(a:i8x32,b:i8x32,mask:__m256i) -> i8x32; } @@ -382,4 +391,14 @@ mod tests { assert_eq!(r, u16x16::splat(6)); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_blendv_epi8() { + let (a,b) = (i8x32::splat(4),i8x32::splat(2)); + let mask = i8x32::splat(0).replace(2,-1); + let e = i8x32::splat(4).replace(2,2); + let r= avx2::_mm256_blendv_epi8(a,b,mask); + assert_eq!(r,e); + } + } From 77c2560d99f53922a338b25c98edf411ce383ff9 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 07:08:16 -0500 Subject: [PATCH 08/24] cmpeq and cmpgt --- src/x86/avx2.rs | 127 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 1b4d5c9315..2fc803809c 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -115,6 +115,64 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { unsafe { pblendvb(a,b,mask) } } +/// Compare packed 64-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi64(a:i64x4,b:i64x4) -> i64x4 { + a.eq(b) +} + +/// Compare packed 32-bit integers in `a` and `b` for equality. 
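+///
+/// Each result lane is all ones (`-1` when viewed as a signed integer)
+/// where the lanes compare equal and all zeros otherwise, so the result
+/// is directly usable as a mask. A sketch:
+///
+/// ```ignore
+/// let r = _mm256_cmpeq_epi32(i32x8::splat(7), i32x8::splat(7));
+/// assert_eq!(r, i32x8::splat(-1));
+/// ```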
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi32(a:i32x8,b:i32x8) -> i32x8 { + a.eq(b) +} + +/// Compare packed 16-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi16(a:i16x16,b:i16x16) -> i16x16 { + a.eq(b) +} + +/// Compare packed 8-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi8(a:i8x32,b:i8x32) -> i8x32 { + a.eq(b) +} + +/// Compare packed 64-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi64(a:i64x4,b:i64x4) -> i64x4 { + a.gt(b) +} + +/// Compare packed 32-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi32(a:i32x8,b:i32x8) -> i32x8 { + a.gt(b) +} + +/// Compare packed 16-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi16(a:i16x16,b:i16x16) -> i16x16 { + a.gt(b) +} + +/// Compare packed 8-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi8(a:i8x32,b:i8x32) -> i8x32 { + a.gt(b) +} + + + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -401,4 +459,73 @@ mod tests { assert_eq!(r,e); } + + #[test] + fn _mm256_cmpeq_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + let b = i8x32::new( + 31,30,2,28,27,26,25,24,23,22,21,20,19,18,17,16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi8(a, b); + assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); + } + + #[test] + fn _mm256_cmpeq_epi16() { + let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15); + let b = i16x16::new(15,14,2,12,11,10,9,8,7, 6, 5, 4, 3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi16(a, b); + assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16)); + } + + #[test] + fn _mm256_cmpeq_epi32() { + let a = i32x8::new(0, 1, 2, 3,4,5,6,7); + let b = i32x8::new(7,6,2,4,3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi32(a, b); + assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32)); + } + + #[test] + fn _mm256_cmpeq_epi64() { + let a = i64x4::new(0, 1, 2, 3); + let b = i64x4::new(3, 2, 2, 0); + let r = avx2::_mm256_cmpeq_epi64(a, b); + assert_eq!(r, i64x4::splat(0).replace(2, 0xFFFFFFFFFFFFFFFFu64 as i64)); + } + + #[test] + fn _mm256_cmpgt_epi8() { + let a = i8x32::splat(0).replace(0, 5); + let b = i8x32::splat(0); + let r = avx2::_mm256_cmpgt_epi8(a, b); + assert_eq!(r, i8x32::splat(0).replace(0, 0xFFu8 as i8)); + } + + #[test] + fn _mm256_cmpgt_epi16() { + let a = i16x16::splat(0).replace(0, 5); + let b = i16x16::splat(0); + let r = avx2::_mm256_cmpgt_epi16(a, b); + assert_eq!(r, i16x16::splat(0).replace(0, 0xFFFFu16 as i16)); + } + + #[test] + fn _mm256_cmpgt_epi32() { + let a = i32x8::splat(0).replace(0, 5); + let b = i32x8::splat(0); + let r = avx2::_mm256_cmpgt_epi32(a, b); + assert_eq!(r, i32x8::splat(0).replace(0, 0xFFFFFFFFu32 as i32)); + } + + #[test] + fn _mm256_cmpgt_epi64() { + let a = i64x4::splat(0).replace(0, 5); + let b = i64x4::splat(0); + let r = avx2::_mm256_cmpgt_epi64(a, b); + assert_eq!(r, i64x4::splat(0).replace(0, 0xFFFFFFFFFFFFFFFFu64 as i64)); + } + } From 3c3a2da6f2f3e17900d64b31540c64660aca43f5 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 07:09:46 -0500 Subject: [PATCH 09/24] 
todo comments --- src/x86/avx2.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 2fc803809c..df2b145b3a 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -108,6 +108,8 @@ pub fn _mm256_avg_epu8 (a:u8x32,b:u8x32) -> u8x32 { unsafe { pavgb(a,b) } } +// TODO alignr + /// Blend packed 8-bit integers from `a` and `b` using `mask`. #[inline(always)] #[target_feature = "+avx2"] @@ -115,6 +117,10 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { unsafe { pblendvb(a,b,mask) } } +// TODO rest of blend + +// TODO broadcast + /// Compare packed 64-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] From 2070eb504d93046d3d5b3129717877bd0d7d11d0 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 09:26:55 -0500 Subject: [PATCH 10/24] style fixes --- src/x86/avx2.rs | 76 +++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index df2b145b3a..9d20890bf5 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -54,35 +54,35 @@ pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { - unsafe { paddsb(a,b) } + unsafe { paddsb(a, b) } } /// Add packed 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { paddsw(a,b) } + unsafe { paddsw(a, b) } } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { - unsafe { paddusb(a,b) } + unsafe { paddusb(a, b) } } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { - unsafe { paddusw(a,b) } + unsafe { paddusw(a, b) } } /// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_and_si256(a: __m256i, b:__m256i) -> __m256i { +pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { a & b } @@ -90,25 +90,28 @@ pub fn _mm256_and_si256(a: __m256i, b:__m256i) -> __m256i { /// in `a` and then AND with `b`. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_andnot_si256(a: __m256i, b:__m256i) -> __m256i { +pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { (!a) & b } /// Average packed unsigned 16-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_avg_epu16 (a:u16x16,b:u16x16) -> u16x16 { - unsafe { pavgw(a,b) } +pub fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 { + unsafe { pavgw(a, b) } } /// Average packed unsigned 8-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_avg_epu8 (a:u8x32,b:u8x32) -> u8x32 { - unsafe { pavgb(a,b) } +pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { + unsafe { pavgb(a, b) } } -// TODO alignr +// TODO _mm256_alignr_epi8 +// TODO _mm256_blend_epi16 +// TODO _mm_blend_epi32 +// TODO _mm256_blend_epi32 /// Blend packed 8-bit integers from `a` and `b` using `mask`. 
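 ///
 /// For each of the 32 byte lanes, the lane from `b` is selected when the
 /// most significant bit of the corresponding `mask` byte is set, and the
 /// lane from `a` otherwise. A sketch mirroring the test below:
 ///
 /// ```ignore
 /// let mask = i8x32::splat(0).replace(2, -1); // take `b` only in lane 2
 /// let r = _mm256_blendv_epi8(i8x32::splat(4), i8x32::splat(2), mask);
 /// assert_eq!(r, i8x32::splat(4).replace(2, 2));
 /// ```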
#[inline(always)] @@ -117,68 +120,80 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { unsafe { pblendvb(a,b,mask) } } -// TODO rest of blend +// TODO _mm_broadcastb_epi8 +// TODO _mm256_broadcastb_epi8 +// TODO _mm_broadcastd_epi32 +// TODO _mm256_broadcastd_epi32 +// TODO _mm_broadcastq_epi64 +// TODO _mm256_broadcastq_epi64 +// TODO _mm_broadcastsd_pd +// TODO _mm256_broadcastsd_pd +// TODO _mm_broadcastsi128_si256 +// TODO _mm256_broadcastsi128_si256 +// TODO _mm_broadcastss_ps +// TODO _mm256_broadcastss_ps +// TODO _mm_broadcastw_epi16 +// TODO _mm256_broadcastw_epi16 +// TODO _mm256_bslli_epi128 +// TODO _mm256_bsrli_epi128 -// TODO broadcast /// Compare packed 64-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpeq_epi64(a:i64x4,b:i64x4) -> i64x4 { +pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 { a.eq(b) } /// Compare packed 32-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpeq_epi32(a:i32x8,b:i32x8) -> i32x8 { +pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 { a.eq(b) } /// Compare packed 16-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpeq_epi16(a:i16x16,b:i16x16) -> i16x16 { +pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 { a.eq(b) } /// Compare packed 8-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpeq_epi8(a:i8x32,b:i8x32) -> i8x32 { +pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 { a.eq(b) } /// Compare packed 64-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpgt_epi64(a:i64x4,b:i64x4) -> i64x4 { +pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 { a.gt(b) } /// Compare packed 32-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpgt_epi32(a:i32x8,b:i32x8) -> i32x8 { +pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 { a.gt(b) } /// Compare packed 16-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpgt_epi16(a:i16x16,b:i16x16) -> i16x16 { +pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 { a.gt(b) } /// Compare packed 8-bit integers in `a` and `b` for greater-than. 
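 ///
 /// The comparison is signed; each result lane is all ones where the lane
 /// in `a` is greater and all zeros otherwise. A sketch mirroring the
 /// test below:
 ///
 /// ```ignore
 /// let a = i8x32::splat(0).replace(0, 5);
 /// let r = _mm256_cmpgt_epi8(a, i8x32::splat(0));
 /// assert_eq!(r, i8x32::splat(0).replace(0, -1));
 /// ```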
#[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpgt_epi8(a:i8x32,b:i8x32) -> i8x32 { +pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 { a.gt(b) } - - #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -188,19 +203,19 @@ extern "C" { #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.padds.b"] - fn paddsb(a:i8x32,b:i8x32) -> i8x32; + fn paddsb(a: i8x32, b: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.padds.w"] - fn paddsw(a:i16x16,b:i16x16) -> i16x16; + fn paddsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.paddus.b"] - fn paddusb(a:u8x32,b:u8x32) -> u8x32; + fn paddusb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.paddus.w"] - fn paddusw(a:u16x16,b:u16x16) -> u16x16; + fn paddusw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pavg.b"] - fn pavgb(a:u8x32,b:u8x32) -> u8x32; + fn pavgb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.pavg.w"] - fn pavgw(a:u16x16,b:u16x16) -> u16x16; + fn pavgw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pblendvb"] - fn pblendvb(a:i8x32,b:i8x32,mask:__m256i) -> i8x32; + fn pblendvb(a: i8x32, b: i8x32, mask: __m256i) -> i8x32; } @@ -465,7 +480,6 @@ mod tests { assert_eq!(r,e); } - #[test] fn _mm256_cmpeq_epi8() { let a = i8x32::new( From 922dacd44b424ed537d8a11ca73b3f0d62ac731b Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 12:26:53 -0500 Subject: [PATCH 11/24] update TODO, fix styles, add hadd and hsub --- TODO.md | 389 ++++++++++++++++++++++++++++++++++++++++++++++++ src/x86/avx2.rs | 316 ++++++++++++++++++++++++++++----------- 2 files changed, 621 insertions(+), 84 deletions(-) diff --git a/TODO.md b/TODO.md index 42ef6e43bb..2764cd8d97 100644 --- a/TODO.md +++ b/TODO.md @@ -5,6 +5,7 @@ Intel intrinsics. Replace `SSE4.2` with the intended type. 
rg '^> TODO.md ``` +rg calls the ripgrep tool, which can be installed with `cargo install ripgrep` sse --- @@ -535,3 +536,391 @@ sse4.2 * [ ] `_mm_crc32_u16` * [ ] `_mm_crc32_u32` * [ ] `_mm_crc32_u64` + + +avx +--- +* [ ] `_mm256_add_pd` +* [ ] `_mm256_add_ps` +* [ ] `_mm256_addsub_pd` +* [ ] `_mm256_addsub_ps` +* [ ] `_mm256_and_pd` +* [ ] `_mm256_and_ps` +* [ ] `_mm256_andnot_pd` +* [ ] `_mm256_andnot_ps` +* [ ] `_mm256_blend_pd` +* [ ] `_mm256_blend_ps` +* [ ] `_mm256_blendv_pd` +* [ ] `_mm256_blendv_ps` +* [ ] `_mm256_div_pd` +* [ ] `_mm256_div_ps` +* [ ] `_mm256_dp_ps` +* [ ] `_mm256_hadd_pd` +* [ ] `_mm256_hadd_ps` +* [ ] `_mm256_hsub_pd` +* [ ] `_mm256_hsub_ps` +* [ ] `_mm256_max_pd` +* [ ] `_mm256_max_ps` +* [ ] `_mm256_min_pd` +* [ ] `_mm256_min_ps` +* [ ] `_mm256_mul_pd` +* [ ] `_mm256_mul_ps` +* [ ] `_mm256_or_pd` +* [ ] `_mm256_or_ps` +* [ ] `_mm256_shuffle_pd` +* [ ] `_mm256_shuffle_ps` +* [ ] `_mm256_sub_pd` +* [ ] `_mm256_sub_ps` +* [ ] `_mm256_xor_pd` +* [ ] `_mm256_xor_ps` +* [ ] `_mm_cmp_pd` +* [ ] `_mm256_cmp_pd` +* [ ] `_mm_cmp_ps` +* [ ] `_mm256_cmp_ps` +* [ ] `_mm_cmp_sd` +* [ ] `_mm_cmp_ss` +* [ ] `_mm256_cvtepi32_pd` +* [ ] `_mm256_cvtepi32_ps` +* [ ] `_mm256_cvtpd_ps` +* [ ] `_mm256_cvtps_epi32` +* [ ] `_mm256_cvtps_pd` +* [ ] `_mm256_cvttpd_epi32` +* [ ] `_mm256_cvtpd_epi32` +* [ ] `_mm256_cvttps_epi32` +* [ ] `_mm256_extractf128_ps` +* [ ] `_mm256_extractf128_pd` +* [ ] `_mm256_extractf128_si256` +* [ ] `_mm256_extract_epi8` +* [ ] `_mm256_extract_epi16` +* [ ] `_mm256_extract_epi32` +* [ ] `_mm256_extract_epi64` +* [ ] `_mm256_zeroall` +* [ ] `_mm256_zeroupper` +* [ ] `_mm256_permutevar_ps` +* [ ] `_mm_permutevar_ps` +* [ ] `_mm256_permute_ps` +* [ ] `_mm_permute_ps` +* [ ] `_mm256_permutevar_pd` +* [ ] `_mm_permutevar_pd` +* [ ] `_mm256_permute_pd` +* [ ] `_mm_permute_pd` +* [ ] `_mm256_permute2f128_ps` +* [ ] `_mm256_permute2f128_pd` +* [ ] `_mm256_permute2f128_si256` +* [ ] `_mm256_broadcast_ss` +* [ ] `_mm_broadcast_ss` +* [ ] `_mm256_broadcast_sd` +* [ ] `_mm256_broadcast_ps` +* [ ] `_mm256_broadcast_pd` +* [ ] `_mm256_insertf128_ps` +* [ ] `_mm256_insertf128_pd` +* [ ] `_mm256_insertf128_si256` +* [ ] `_mm256_insert_epi8` +* [ ] `_mm256_insert_epi16` +* [ ] `_mm256_insert_epi32` +* [ ] `_mm256_insert_epi64` +* [ ] `_mm256_load_pd` +* [ ] `_mm256_store_pd` +* [ ] `_mm256_load_ps` +* [ ] `_mm256_store_ps` +* [ ] `_mm256_loadu_pd` +* [ ] `_mm256_storeu_pd` +* [ ] `_mm256_loadu_ps` +* [ ] `_mm256_storeu_ps` +* [ ] `_mm256_load_si256` +* [ ] `_mm256_store_si256` +* [ ] `_mm256_loadu_si256` +* [ ] `_mm256_storeu_si256` +* [ ] `_mm256_maskload_pd` +* [ ] `_mm256_maskstore_pd` +* [ ] `_mm_maskload_pd` +* [ ] `_mm_maskstore_pd` +* [ ] `_mm256_maskload_ps` +* [ ] `_mm256_maskstore_ps` +* [ ] `_mm_maskload_ps` +* [ ] `_mm_maskstore_ps` +* [ ] `_mm256_movehdup_ps` +* [ ] `_mm256_moveldup_ps` +* [ ] `_mm256_movedup_pd` +* [ ] `_mm256_lddqu_si256` +* [ ] `_mm256_stream_si256` +* [ ] `_mm256_stream_pd` +* [ ] `_mm256_stream_ps` +* [ ] `_mm256_rcp_ps` +* [ ] `_mm256_rsqrt_ps` +* [ ] `_mm256_sqrt_pd` +* [ ] `_mm256_sqrt_ps` +* [ ] `_mm256_round_pd` +* [ ] `_mm256_round_ps` +* [ ] `_mm256_unpackhi_pd` +* [ ] `_mm256_unpackhi_ps` +* [ ] `_mm256_unpacklo_pd` +* [ ] `_mm256_unpacklo_ps` +* [ ] `_mm256_testz_si256` +* [ ] `_mm256_testc_si256` +* [ ] `_mm256_testnzc_si256` +* [ ] `_mm256_testz_pd` +* [ ] `_mm256_testc_pd` +* [ ] `_mm256_testnzc_pd` +* [ ] `_mm_testz_pd` +* [ ] `_mm_testc_pd` +* [ ] `_mm_testnzc_pd` +* [ ] `_mm256_testz_ps` +* [ ] `_mm256_testc_ps` +* [ ] 
`_mm256_testnzc_ps` +* [ ] `_mm_testz_ps` +* [ ] `_mm_testc_ps` +* [ ] `_mm_testnzc_ps` +* [ ] `_mm256_movemask_pd` +* [ ] `_mm256_movemask_ps` +* [ ] `_mm256_setzero_pd` +* [ ] `_mm256_setzero_ps` +* [ ] `_mm256_setzero_si256` +* [ ] `_mm256_set_pd` +* [ ] `_mm256_set_ps` +* [ ] `_mm256_set_epi8` +* [ ] `_mm256_set_epi16` +* [ ] `_mm256_set_epi32` +* [ ] `_mm256_set_epi64x` +* [ ] `_mm256_setr_pd` +* [ ] `_mm256_setr_ps` +* [ ] `_mm256_setr_epi8` +* [ ] `_mm256_setr_epi16` +* [ ] `_mm256_setr_epi32` +* [ ] `_mm256_setr_epi64x` +* [ ] `_mm256_set1_pd` +* [ ] `_mm256_set1_ps` +* [ ] `_mm256_set1_epi8` +* [ ] `_mm256_set1_epi16` +* [ ] `_mm256_set1_epi32` +* [ ] `_mm256_set1_epi64x` +* [ ] `_mm256_castpd_ps` +* [ ] `_mm256_castps_pd` +* [ ] `_mm256_castps_si256` +* [ ] `_mm256_castpd_si256` +* [ ] `_mm256_castsi256_ps` +* [ ] `_mm256_castsi256_pd` +* [ ] `_mm256_castps256_ps128` +* [ ] `_mm256_castpd256_pd128` +* [ ] `_mm256_castsi256_si128` +* [ ] `_mm256_castps128_ps256` +* [ ] `_mm256_castpd128_pd256` +* [ ] `_mm256_castsi128_si256` +* [ ] `_mm256_zextps128_ps256` +* [ ] `_mm256_zextpd128_pd256` +* [ ] `_mm256_zextsi128_si256` +* [ ] `_mm256_floor_ps` +* [ ] `_mm256_ceil_ps` +* [ ] `_mm256_floor_pd` +* [ ] `_mm256_ceil_pd` +* [ ] `_mm256_undefined_ps` +* [ ] `_mm256_undefined_pd` +* [ ] `_mm256_undefined_si256` +* [ ] `_mm256_set_m128` +* [ ] `_mm256_set_m128d` +* [ ] `_mm256_set_m128i` +* [ ] `_mm256_setr_m128` +* [ ] `_mm256_setr_m128d` +* [ ] `_mm256_setr_m128i` +* [ ] `_mm256_loadu2_m128` +* [ ] `_mm256_loadu2_m128d` +* [ ] `_mm256_loadu2_m128i` +* [ ] `_mm256_storeu2_m128` +* [ ] `_mm256_storeu2_m128d` +* [ ] `_mm256_storeu2_m128i` + + + +avx2 +---- +* [x] `_mm256_abs_epi8` +* [x] `_mm256_abs_epi16` +* [x] `_mm256_abs_epi32` +* [x] `_mm256_add_epi8` +* [x] `_mm256_add_epi16` +* [x] `_mm256_add_epi32` +* [x] `_mm256_add_epi64` +* [x] `_mm256_adds_epi8` +* [x] `_mm256_adds_epi16` +* [x] `_mm256_adds_epu8` +* [x] `_mm256_adds_epu16` +* [ ] `_mm256_alignr_epi8` +* [x] `_mm256_and_si256` +* [x] `_mm256_andnot_si256` +* [x] `_mm256_avg_epu8` +* [x] `_mm256_avg_epu16` +* [ ] `_mm256_blend_epi16` +* [ ] `_mm_blend_epi32` +* [ ] `_mm256_blend_epi32` +* [x] `_mm256_blendv_epi8` +* [ ] `_mm_broadcastb_epi8` +* [ ] `_mm256_broadcastb_epi8` +* [ ] `_mm_broadcastd_epi32` +* [ ] `_mm256_broadcastd_epi32` +* [ ] `_mm_broadcastq_epi64` +* [ ] `_mm256_broadcastq_epi64` +* [ ] `_mm_broadcastsd_pd` +* [ ] `_mm256_broadcastsd_pd` +* [ ] `_mm_broadcastsi128_si256` +* [ ] `_mm256_broadcastsi128_si256` +* [ ] `_mm_broadcastss_ps` +* [ ] `_mm256_broadcastss_ps` +* [ ] `_mm_broadcastw_epi16` +* [ ] `_mm256_broadcastw_epi16` +* [x] `_mm256_cmpeq_epi8` +* [x] `_mm256_cmpeq_epi16` +* [x] `_mm256_cmpeq_epi32` +* [x] `_mm256_cmpeq_epi64` +* [x] `_mm256_cmpgt_epi8` +* [x] `_mm256_cmpgt_epi16` +* [x] `_mm256_cmpgt_epi32` +* [x] `_mm256_cmpgt_epi64` +* [ ] `_mm256_cvtepi16_epi32` +* [ ] `_mm256_cvtepi16_epi64` +* [ ] `_mm256_cvtepi32_epi64` +* [ ] `_mm256_cvtepi8_epi16` +* [ ] `_mm256_cvtepi8_epi32` +* [ ] `_mm256_cvtepi8_epi64` +* [ ] `_mm256_cvtepu16_epi32` +* [ ] `_mm256_cvtepu16_epi64` +* [ ] `_mm256_cvtepu32_epi64` +* [ ] `_mm256_cvtepu8_epi16` +* [ ] `_mm256_cvtepu8_epi32` +* [ ] `_mm256_cvtepu8_epi64` +* [ ] `_mm256_extracti128_si256` +* [x] `_mm256_hadd_epi16` +* [x] `_mm256_hadd_epi32` +* [x] `_mm256_hadds_epi16` +* [x] `_mm256_hsub_epi16` +* [x] `_mm256_hsub_epi32` +* [x] `_mm256_hsubs_epi16` +* [ ] `_mm_i32gather_pd` +* [ ] `_mm256_i32gather_pd` +* [ ] `_mm_i32gather_ps` +* [ ] `_mm256_i32gather_ps` +* [ ] 
`_mm_i32gather_epi32` +* [ ] `_mm256_i32gather_epi32` +* [ ] `_mm_i32gather_epi64` +* [ ] `_mm256_i32gather_epi64` +* [ ] `_mm_i64gather_pd` +* [ ] `_mm256_i64gather_pd` +* [ ] `_mm_i64gather_ps` +* [ ] `_mm256_i64gather_ps` +* [ ] `_mm_i64gather_epi32` +* [ ] `_mm256_i64gather_epi32` +* [ ] `_mm_i64gather_epi64` +* [ ] `_mm256_i64gather_epi64` +* [ ] `_mm256_inserti128_si256` +* [ ] `_mm256_madd_epi16` +* [ ] `_mm256_maddubs_epi16` +* [ ] `_mm_mask_i32gather_pd` +* [ ] `_mm256_mask_i32gather_pd` +* [ ] `_mm_mask_i32gather_ps` +* [ ] `_mm256_mask_i32gather_ps` +* [ ] `_mm_mask_i32gather_epi32` +* [ ] `_mm256_mask_i32gather_epi32` +* [ ] `_mm_mask_i32gather_epi64` +* [ ] `_mm256_mask_i32gather_epi64` +* [ ] `_mm_mask_i64gather_pd` +* [ ] `_mm256_mask_i64gather_pd` +* [ ] `_mm_mask_i64gather_ps` +* [ ] `_mm256_mask_i64gather_ps` +* [ ] `_mm_mask_i64gather_epi32` +* [ ] `_mm256_mask_i64gather_epi32` +* [ ] `_mm_mask_i64gather_epi64` +* [ ] `_mm256_mask_i64gather_epi64` +* [ ] `_mm_maskload_epi32` +* [ ] `_mm256_maskload_epi32` +* [ ] `_mm_maskload_epi64` +* [ ] `_mm256_maskload_epi64` +* [ ] `_mm_maskstore_epi32` +* [ ] `_mm256_maskstore_epi32` +* [ ] `_mm_maskstore_epi64` +* [ ] `_mm256_maskstore_epi64` +* [ ] `_mm256_max_epi8` +* [ ] `_mm256_max_epi16` +* [ ] `_mm256_max_epi32` +* [ ] `_mm256_max_epu8` +* [ ] `_mm256_max_epu16` +* [ ] `_mm256_max_epu32` +* [ ] `_mm256_min_epi8` +* [ ] `_mm256_min_epi16` +* [ ] `_mm256_min_epi32` +* [ ] `_mm256_min_epu8` +* [ ] `_mm256_min_epu16` +* [ ] `_mm256_min_epu32` +* [ ] `_mm256_movemask_epi8` +* [ ] `_mm256_mpsadbw_epu8` +* [ ] `_mm256_mul_epi32` +* [ ] `_mm256_mul_epu32` +* [ ] `_mm256_mulhi_epi16` +* [ ] `_mm256_mulhi_epu16` +* [ ] `_mm256_mulhrs_epi16` +* [ ] `_mm256_mullo_epi16` +* [ ] `_mm256_mullo_epi32` +* [ ] `_mm256_or_si256` +* [ ] `_mm256_packs_epi16` +* [ ] `_mm256_packs_epi32` +* [ ] `_mm256_packus_epi16` +* [ ] `_mm256_packus_epi32` +* [ ] `_mm256_permute2x128_si256` +* [ ] `_mm256_permute4x64_epi64` +* [ ] `_mm256_permute4x64_pd` +* [ ] `_mm256_permutevar8x32_epi32` +* [ ] `_mm256_permutevar8x32_ps` +* [ ] `_mm256_sad_epu8` +* [ ] `_mm256_shuffle_epi32` +* [ ] `_mm256_shuffle_epi8` +* [ ] `_mm256_shufflehi_epi16` +* [ ] `_mm256_shufflelo_epi16` +* [ ] `_mm256_sign_epi8` +* [ ] `_mm256_sign_epi16` +* [ ] `_mm256_sign_epi32` +* [ ] `_mm256_slli_si256` +* [ ] `_mm256_bslli_epi128` +* [ ] `_mm256_sll_epi16` +* [ ] `_mm256_slli_epi16` +* [ ] `_mm256_sll_epi32` +* [ ] `_mm256_slli_epi32` +* [ ] `_mm256_sll_epi64` +* [ ] `_mm256_slli_epi64` +* [ ] `_mm_sllv_epi32` +* [ ] `_mm256_sllv_epi32` +* [ ] `_mm_sllv_epi64` +* [ ] `_mm256_sllv_epi64` +* [ ] `_mm256_sra_epi16` +* [ ] `_mm256_srai_epi16` +* [ ] `_mm256_sra_epi32` +* [ ] `_mm256_srai_epi32` +* [ ] `_mm_srav_epi32` +* [ ] `_mm256_srav_epi32` +* [ ] `_mm256_srli_si256` +* [ ] `_mm256_bsrli_epi128` +* [ ] `_mm256_srl_epi16` +* [ ] `_mm256_srli_epi16` +* [ ] `_mm256_srl_epi32` +* [ ] `_mm256_srli_epi32` +* [ ] `_mm256_srl_epi64` +* [ ] `_mm256_srli_epi64` +* [ ] `_mm_srlv_epi32` +* [ ] `_mm256_srlv_epi32` +* [ ] `_mm_srlv_epi64` +* [ ] `_mm256_srlv_epi64` +* [ ] `_mm256_stream_load_si256` +* [ ] `_mm256_sub_epi8` +* [ ] `_mm256_sub_epi16` +* [ ] `_mm256_sub_epi32` +* [ ] `_mm256_sub_epi64` +* [ ] `_mm256_subs_epi8` +* [ ] `_mm256_subs_epi16` +* [ ] `_mm256_subs_epu8` +* [ ] `_mm256_subs_epu16` +* [ ] `_mm256_xor_si256` +* [ ] `_mm256_unpackhi_epi8` +* [ ] `_mm256_unpackhi_epi16` +* [ ] `_mm256_unpackhi_epi32` +* [ ] `_mm256_unpackhi_epi64` +* [ ] `_mm256_unpacklo_epi8` +* [ ] 
`_mm256_unpacklo_epi16` +* [ ] `_mm256_unpacklo_epi32` +* [ ] `_mm256_unpacklo_epi64` diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 9d20890bf5..cbe14b1b80 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -78,7 +78,7 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { paddusw(a, b) } } -/// Compute the bitwise AND of 256 bits (representing integer data) +/// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] @@ -86,7 +86,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { a & b } -/// Compute the bitwise NOT of 256 bits (representing integer data) +/// Compute the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. #[inline(always)] #[target_feature = "+avx2"] @@ -121,21 +121,21 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { } // TODO _mm_broadcastb_epi8 -// TODO _mm256_broadcastb_epi8 -// TODO _mm_broadcastd_epi32 -// TODO _mm256_broadcastd_epi32 -// TODO _mm_broadcastq_epi64 +// TODO _mm256_broadcastb_epi8 +// TODO _mm_broadcastd_epi32 +// TODO _mm256_broadcastd_epi32 +// TODO _mm_broadcastq_epi64 // TODO _mm256_broadcastq_epi64 -// TODO _mm_broadcastsd_pd -// TODO _mm256_broadcastsd_pd -// TODO _mm_broadcastsi128_si256 -// TODO _mm256_broadcastsi128_si256 -// TODO _mm_broadcastss_ps -// TODO _mm256_broadcastss_ps -// TODO _mm_broadcastw_epi16 -// TODO _mm256_broadcastw_epi16 -// TODO _mm256_bslli_epi128 -// TODO _mm256_bsrli_epi128 +// TODO _mm_broadcastsd_pd +// TODO _mm256_broadcastsd_pd +// TODO _mm_broadcastsi128_si256 +// TODO _mm256_broadcastsi128_si256 +// TODO _mm_broadcastss_ps +// TODO _mm256_broadcastss_ps +// TODO _mm_broadcastw_epi16 +// TODO _mm256_broadcastw_epi16 +// TODO _mm256_bslli_epi128 +// TODO _mm256_bsrli_epi128 /// Compare packed 64-bit integers in `a` and `b` for equality. @@ -194,6 +194,64 @@ pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 { a.gt(b) } +// TODO _mm256_cvtepi16_epi32 +// TODO _mm256_cvtepi16_epi64 +// TODO _mm256_cvtepi32_epi64 +// TODO _mm256_cvtepi8_epi16 +// TODO _mm256_cvtepi8_epi32 +// TODO _mm256_cvtepi8_epi64 +// TODO _mm256_cvtepu16_epi32 +// TODO _mm256_cvtepu16_epi64 +// TODO _mm256_cvtepu32_epi64 +// TODO _mm256_cvtepu8_epi16 +// TODO _mm256_cvtepu8_epi32 +// TODO _mm256_cvtepu8_epi64 +// TODO _m128i _mm256_extracti128_si256 + +/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phaddw(a, b) } +} + +/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { phaddd(a, b) } +} + +/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phaddsw(a, b) } +} + +/// Horizontally substract adjacent pairs of 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phsubw(a, b) } +} + +/// Horizontally substract adjacent pairs of 32-bit integers in `a` and `b`. 
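+///
+/// Like the other horizontal operations, this works within each 128-bit
+/// half independently, giving `[a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7,
+/// b4-b5, b6-b7]` rather than a straight left-to-right pairing across the
+/// full 256 bits. A sketch:
+///
+/// ```ignore
+/// let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+/// let b = i32x8::new(10, 20, 30, 40, 50, 60, 70, 80);
+/// let r = _mm256_hsub_epi32(a, b);
+/// assert_eq!(r, i32x8::new(-1, -1, -10, -10, -1, -1, -10, -10));
+/// ```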
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { phsubd(a, b) } +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phsubsw(a, b) } +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -201,7 +259,7 @@ extern "C" { #[link_name = "llvm.x86.avx2.pabs.w"] fn pabsw(a: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pabs.d"] - fn pabsd(a: i32x8) -> i32x8; + fn pabsd(a: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.padds.b"] fn paddsb(a: i8x32, b: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.padds.w"] @@ -216,6 +274,18 @@ extern "C" { fn pavgw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pblendvb"] fn pblendvb(a: i8x32, b: i8x32, mask: __m256i) -> i8x32; + #[link_name = "llvm.x86.avx2.phadd.w"] + fn phaddw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phadd.d"] + fn phaddd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.phadd.sw"] + fn phaddsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phsub.w"] + fn phsubw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phsub.d"] + fn phsubd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.phsub.sw"] + fn phsubsw(a: i16x16, b: i16x16) -> i16x16; } @@ -229,42 +299,51 @@ mod tests { #[test] #[target_feature = "+avx2"] fn _mm256_abs_epi32() { - let a = i32x8::new(0, 1, -1, std::i32::MAX, - std::i32::MIN + 1, 100, -100, -32); + let a = i32x8::new( + 0, 1, -1, std::i32::MAX, + std::i32::MIN + 1, 100, -100, -32); let r = avx2::_mm256_abs_epi32(a); - let e = i32x8::new(0, 1, 1, std::i32::MAX, - (std::i32::MIN + 1).abs(), 100, 100, 32); + let e = i32x8::new( + 0, 1, 1, std::i32::MAX, + (std::i32::MIN + 1).abs(), 100, 100, 32); assert_eq!(r, e); } #[test] #[target_feature = "+avx2"] fn _mm256_abs_epi16() { - let a = i16x16::new(0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i16::MAX, - std::i16::MIN + 1, 100, -100, -32); + let a = i16x16::new( + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i16::MAX, + std::i16::MIN + 1, 100, -100, -32); let r = avx2::_mm256_abs_epi16(a); - let e = i16x16::new(0, 1, 1, 2, - 2, 3, 3, 4, - 4, 5, 5, std::i16::MAX, - (std::i16::MIN + 1).abs(), 100, 100, 32); + let e = i16x16::new( + 0, 1, 1, 2, + 2, 3, 3, 4, + 4, 5, 5, std::i16::MAX, + (std::i16::MIN + 1).abs(), 100, 100, 32); assert_eq!(r, e); } #[test] #[target_feature = "+avx2"] fn _mm256_abs_epi8() { - let a = i8x32::new(0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i8::MAX, - std::i8::MIN + 1, 100, -100, -32, - 0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i8::MAX, - std::i8::MIN + 1, 100, -100, -32); + let a = i8x32::new( + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32, + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32); let r = avx2::_mm256_abs_epi8(a); - let e = i8x32::new(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32); + let e = i8x32::new( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32); assert_eq!(r, e); } @@ -291,13 +370,16 @@ mod tests { #[test] #[target_feature = 
"+avx2"] fn _mm256_add_epi16() { - let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); - let b = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); let r = avx2::_mm256_add_epi16(a, b); - let e = i16x16::new(0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30); + let e = i16x16::new( + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); assert_eq!(r, e); } @@ -305,21 +387,21 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_add_epi8() { let a = i8x32::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = i8x32::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let r = avx2::_mm256_add_epi8(a, b); let e = i8x32::new( - 0, 2, 4, 6, 8, 10, 12, 14, 16, - 18, 20, 22, 24, 26, 28, 30, 32, - 34, 36, 38, 40, 42, 44, 46, 48, - 50, 52, 54, 56, 58, 60,62); + 0, 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 52, 54, 56, 58, 60, 62); assert_eq!(r, e); } @@ -328,14 +410,14 @@ mod tests { fn _mm256_adds_epi8() { let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = i8x32::new( - 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, - 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63); + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); let r = avx2::_mm256_adds_epi8(a, b); let e = i8x32::new( - 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62, - 64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94); + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); assert_eq!(r, e); } @@ -361,13 +443,13 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_adds_epi16() { let a = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i16x16::new( - 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47); - let r = avx2::_mm256_adds_epi16(a, b); + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); + let r = avx2::_mm256_adds_epi16(a, b); let e = i16x16::new( - 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62); - + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); + assert_eq!(r, e); } @@ -388,20 +470,20 @@ mod tests { let r = avx2::_mm256_adds_epi16(a, b); assert_eq!(r, a); } - + #[test] #[target_feature = "+avx2"] fn _mm256_adds_epu8() { let a = u8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = u8x32::new( - 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, - 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63); + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); let r = avx2::_mm256_adds_epu8(a, 
b); let e = u8x32::new( - 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62, - 64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94); + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); assert_eq!(r, e); } @@ -414,18 +496,18 @@ mod tests { assert_eq!(r, a); } - + #[test] #[target_feature = "+avx2"] fn _mm256_adds_epu16() { let a = u16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = u16x16::new( - 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47); + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); let r = avx2::_mm256_adds_epu16(a, b); let e = u16x16::new( - 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62); - + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); + assert_eq!(r, e); } @@ -437,13 +519,13 @@ mod tests { let r = avx2::_mm256_adds_epu16(a, b); assert_eq!(r, a); } - + #[test] #[target_feature = "+avx2"] fn _mm256_and_si256() { assert_eq!( - avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3)), - __m256i::splat(1)); + avx2::_mm256_and_si256( + __m256i::splat(5), __m256i::splat(3)),__m256i::splat(1)); } #[test] @@ -484,9 +566,9 @@ mod tests { fn _mm256_cmpeq_epi8() { let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = i8x32::new( - 31,30,2,28,27,26,25,24,23,22,21,20,19,18,17,16, + 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = avx2::_mm256_cmpeq_epi8(a, b); assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); @@ -494,8 +576,10 @@ mod tests { #[test] fn _mm256_cmpeq_epi16() { - let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15); - let b = i16x16::new(15,14,2,12,11,10,9,8,7, 6, 5, 4, 3, 2, 1, 0); + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = avx2::_mm256_cmpeq_epi16(a, b); assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16)); } @@ -513,7 +597,8 @@ mod tests { let a = i64x4::new(0, 1, 2, 3); let b = i64x4::new(3, 2, 2, 0); let r = avx2::_mm256_cmpeq_epi64(a, b); - assert_eq!(r, i64x4::splat(0).replace(2, 0xFFFFFFFFFFFFFFFFu64 as i64)); + assert_eq!(r, i64x4::splat(0).replace( + 2, 0xFFFFFFFFFFFFFFFFu64 as i64)); } #[test] @@ -545,7 +630,70 @@ mod tests { let a = i64x4::splat(0).replace(0, 5); let b = i64x4::splat(0); let r = avx2::_mm256_cmpgt_epi64(a, b); - assert_eq!(r, i64x4::splat(0).replace(0, 0xFFFFFFFFFFFFFFFFu64 as i64)); + assert_eq!(r, i64x4::splat(0).replace( + 0, 0xFFFFFFFFFFFFFFFFu64 as i64)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadd_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_hadd_epi16(a, b); + let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadd_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_hadd_epi32(a, b); + let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadds_epi16() { + let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1); + let b = i16x16::splat(4); + let r = avx2::_mm256_hadds_epi16(a, b); + let e = i16x16::new( + 0x7FFF, 4, 4, 
4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq!(r,e); } + #[test] + #[target_feature ="+avx2"] + fn _mm256_hsub_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_hsub_epi16(a, b); + let e = i16x16::splat(0); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hsub_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_hsub_epi32(a, b); + let e = i32x8::splat(0); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hsubs_epi16() { + let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1); + let b = i16x16::splat(4); + let r = avx2::_mm256_hsubs_epi16(a, b); + let e = i16x16::splat(0).replace(0,0x7FFF); + assert_eq!(r,e); + } + + } From ff85180789db61824a2aac6334bfbc1fcf7e3a24 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 12:34:18 -0500 Subject: [PATCH 12/24] spacing on tests --- src/x86/avx2.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index cbe14b1b80..7ec508231d 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -675,7 +675,7 @@ mod tests { assert_eq!(r,e); } - #[test] + #[test] #[target_feature = "+avx2"] fn _mm256_hsub_epi32() { let a = i32x8::splat(2); @@ -685,7 +685,7 @@ mod tests { assert_eq!(r,e); } - #[test] + #[test] #[target_feature = "+avx2"] fn _mm256_hsubs_epi16() { let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1); From 9a2409d29e414ddd2131075ba4b0b3df457c383f Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 14:53:21 -0500 Subject: [PATCH 13/24] start on avx --- TODO.md | 6 ++--- src/x86/avx.rs | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ src/x86/avx2.rs | 16 +++++------ src/x86/mod.rs | 2 ++ 4 files changed, 84 insertions(+), 11 deletions(-) create mode 100644 src/x86/avx.rs diff --git a/TODO.md b/TODO.md index 2764cd8d97..1373340a72 100644 --- a/TODO.md +++ b/TODO.md @@ -540,9 +540,9 @@ sse4.2 avx --- -* [ ] `_mm256_add_pd` -* [ ] `_mm256_add_ps` -* [ ] `_mm256_addsub_pd` +* [x] `_mm256_add_pd` +* [x] `_mm256_add_ps` +* [x] `_mm256_addsub_pd` * [ ] `_mm256_addsub_ps` * [ ] `_mm256_and_pd` * [ ] `_mm256_and_ps` diff --git a/src/x86/avx.rs b/src/x86/avx.rs new file mode 100644 index 0000000000..5c48243702 --- /dev/null +++ b/src/x86/avx.rs @@ -0,0 +1,71 @@ +use v256::*; + +/// Add packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 { + a + b +} + +/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 { + a + b +} + +/// Alternatively add and subtract packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. 
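+///
+/// A worked illustration (the same data as the test below): with
+/// `a = [1.0, 2.0, 3.0, 4.0]` and `b = [5.0, 6.0, 7.0, 8.0]`, the
+/// even-indexed lanes subtract and the odd-indexed lanes add, giving
+/// `[1.0 - 5.0, 2.0 + 6.0, 3.0 - 7.0, 4.0 + 8.0] = [-4.0, 8.0, -4.0, 12.0]`.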
+#[inline(always)] +#[target_feature = "+avx"] +pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 { + unsafe { addsubpd256(a,b) } +} + + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx.addsub.pd.256"] + fn addsubpd256(a: f64x4, b:f64x4) -> f64x4; +} + + +#[cfg(test)] +mod tests { + use v256::*; + use x86::avx; + + #[test] + #[target_feature = "+avx"] + fn _mm256_add_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_add_pd(a, b); + let e = f64x4::new(6.0, 8.0, 10.0, 12.0); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx"] + fn _mm256_add_ps() { + let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = avx::_mm256_add_ps(a, b); + let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx"] + fn _mm256_addsub_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_addsub_pd(a, b); + let e = f64x4::new(-4.0,8.0,-4.0,12.0); + assert_eq!(r,e); + } + + + +} \ No newline at end of file diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 7ec508231d..7fb6a62e24 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -252,6 +252,7 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { unsafe { phsubsw(a, b) } } + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -568,7 +569,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = i8x32::new( - 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = avx2::_mm256_cmpeq_epi8(a, b); assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); @@ -641,7 +642,7 @@ mod tests { let b = i16x16::splat(4); let r = avx2::_mm256_hadd_epi16(a, b); let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -651,7 +652,7 @@ mod tests { let b = i32x8::splat(4); let r = avx2::_mm256_hadd_epi32(a, b); let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -662,7 +663,7 @@ mod tests { let r = avx2::_mm256_hadds_epi16(a, b); let e = i16x16::new( 0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -672,7 +673,7 @@ mod tests { let b = i16x16::splat(4); let r = avx2::_mm256_hsub_epi16(a, b); let e = i16x16::splat(0); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -682,7 +683,7 @@ mod tests { let b = i32x8::splat(4); let r = avx2::_mm256_hsub_epi32(a, b); let e = i32x8::splat(0); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -692,8 +693,7 @@ mod tests { let b = i16x16::splat(4); let r = avx2::_mm256_hsubs_epi16(a, b); let e = i16x16::splat(0).replace(0,0x7FFF); - assert_eq!(r,e); + assert_eq!(r, e); } - } diff --git a/src/x86/mod.rs b/src/x86/mod.rs index d36fa4444d..839b531556 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -2,6 +2,7 @@ pub use self::sse::*; pub use self::sse2::*; pub use self::ssse3::*; pub use self::sse42::*; +pub use self::avx::*; pub use self::avx2::*; #[allow(non_camel_case_types)] @@ -13,4 +14,5 @@ mod sse; mod sse2; mod ssse3; mod sse42; +mod avx; mod avx2; \ No newline at end of file From 
04519e2781d7dfa73861ab93906cb3b475c72fdf Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 15:01:08 -0500 Subject: [PATCH 14/24] style fixes --- src/x86/avx.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/x86/avx.rs b/src/x86/avx.rs index 5c48243702..6ec764c377 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1,6 +1,6 @@ use v256::*; -/// Add packed double-precision (64-bit) floating-point elements +/// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. #[inline(always)] #[target_feature = "+avx"] @@ -8,19 +8,19 @@ pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 { a + b } -/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. +/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. #[inline(always)] #[target_feature = "+avx"] pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 { a + b } -/// Alternatively add and subtract packed double-precision (64-bit) +/// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. #[inline(always)] #[target_feature = "+avx"] pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 { - unsafe { addsubpd256(a,b) } + unsafe { addsubpd256(a, b) } } @@ -34,7 +34,7 @@ extern "C" { #[cfg(test)] mod tests { use v256::*; - use x86::avx; + use x86::avx; #[test] #[target_feature = "+avx"] @@ -43,7 +43,7 @@ mod tests { let b = f64x4::new(5.0, 6.0, 7.0, 8.0); let r = avx::_mm256_add_pd(a, b); let e = f64x4::new(6.0, 8.0, 10.0, 12.0); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -53,7 +53,7 @@ mod tests { let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); let r = avx::_mm256_add_ps(a, b); let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -63,7 +63,7 @@ mod tests { let b = f64x4::new(5.0, 6.0, 7.0, 8.0); let r = avx::_mm256_addsub_pd(a, b); let e = f64x4::new(-4.0,8.0,-4.0,12.0); - assert_eq!(r,e); + assert_eq!(r, e); } From fceb2caae3f99aaa82e211768715165013f8b0ae Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 18:01:17 -0500 Subject: [PATCH 15/24] working through max --- src/x86/avx2.rs | 144 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 2840b0144f..3101730ea0 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -252,6 +252,93 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { unsafe { phsubsw(a, b) } } + +// TODO _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) +// TODO _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) +// TODO _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) +// TODO _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) +// TODO _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale) +// TODO _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i32gather_pd (__m128d src, 
double const* base_addr, __m128i vindex, __m128d mask, const int scale) +// TODO _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale) +// TODO _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale) +// TODO _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) +// TODO _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale) +// TODO _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale) +// TODO _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale) +// TODO _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale) +// TODO _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale) +// TODO _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale) +// TODO _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale) +// TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) +// TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) +// TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) +// TODO _mm256_mask_i64gather_ps +// TODO _mm256_inserti128_si256 + +/// Multiply packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs +/// of intermediate 32-bit integers. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 { + unsafe { pmaddwd(a, b) } +} + +/// Vertically multiply each unsigned 8-bit integer from `a` with the +/// corresponding signed 8-bit integer from `b`, producing intermediate +/// signed 16-bit integers. 
Horizontally add adjacent pairs of intermediate +/// signed 16-bit integers +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 { + unsafe { pmaddubsw(a, b) } +} + +// TODO _mm_maskload_epi32 (int const* mem_addr, __m128i mask) +// TODO _mm256_maskload_epi32 (int const* mem_addr, __m256i mask) +// TODO _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask) +// TODO _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask) +// TODO _mm_maskstore_epi32 (int* mem_addr, __m128i mask, __m128i a) +// TODO _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) +// TODO _mm_maskstore_epi64 (__int64* mem_addr, __m128i mask, __m128i a) +// TODO _mm256_maskstore_epi64 (__int64* mem_addr, __m256i mask, __m256i a) + +/// Compare packed 16-bit integers in `a` and `b`, and return the packed +/// maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { pmaxsw(a, b) } +} + +/// Compare packed 32-bit integers in `a` and `b`, and return the packed +/// maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { pmaxsd(a, b) } +} + +/// Compare packed 8-bit integers in `a` and `b`, and return the packed +/// maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { + unsafe { pmaxsb(a, b) } +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -286,6 +373,16 @@ extern "C" { fn phsubd(a: i32x8, b: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.phsub.sw"] fn phsubsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pmadd.wd"] + fn pmaddwd(a: i16x16, b: i16x16) -> i32x8; + #[link_name = "llvm.x86.avx2.pmadd.ub.sw"] + fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16; + #[link_name = "llvm.x86.avx2.pmaxs.w"] + fn pmaxsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pmaxs.d"] + fn pmaxsd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.pmaxs.b"] + fn pmaxsb(a: i8x32, b: i8x32) -> i8x32; } @@ -695,4 +792,51 @@ mod tests { assert_eq!(r, e); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_madd_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_madd_epi16(a, b); + let e = i32x8::splat(16); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_maddubs_epi16() { + let a = u8x32::splat(2); + let b = u8x32::splat(4); + let r = avx2::_mm256_maddubs_epi16(a, b); + let e = i16x16::splat(16); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_max_epi16(a, b); + assert_eq!(r, b); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_max_epi32(a, b); + assert_eq!(r, b); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epi8() { + let a = i8x32::splat(2); + let b = i8x32::splat(4); + let r = avx2::_mm256_max_epi8(a, b); + assert_eq!(r, b); + } + } From e2f5f24cdf42ac31809220d74c375f927ad3816c Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 18:33:15 -0500 Subject: [PATCH 16/24] max and min --- src/x86/avx2.rs | 171 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 3101730ea0..0822402f2c 100644 --- 
a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -339,6 +339,78 @@ pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { unsafe { pmaxsb(a, b) } } +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return +/// the packed maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 { + unsafe { pmaxuw(a, b) } +} + +/// Compare packed unsigned 32-bit integers in `a` and `b`, and return +/// the packed maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 { + unsafe { pmaxud(a, b) } +} + +/// Compare packed unsigned 8-bit integers in `a` and `b`, and return +/// the packed maximum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 { + unsafe { pmaxub(a, b) } +} + +/// Compare packed 16-bit integers in `a` and `b`, and return the packed +/// minimum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { pminsw(a, b) } +} + +/// Compare packed 32-bit integers in `a` and `b`, and return the packed +/// minimum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { pminsd(a, b) } +} + +/// Compare packed 8-bit integers in `a` and `b`, and return the packed +/// minimum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 { + unsafe { pminsb(a, b) } +} + +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return +/// the packed minimum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 { + unsafe { pminuw(a, b) } +} + +/// Compare packed unsigned 32-bit integers in `a` and `b`, and return +/// the packed minimum values. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 { + unsafe { pminud(a, b) } +} + +/// Compare packed unsigned 8-bit integers in `a` and `b`, and return +/// the packed minimum values. 
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 { + unsafe { pminub(a, b) } +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -383,6 +455,24 @@ extern "C" { fn pmaxsd(a: i32x8, b: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.pmaxs.b"] fn pmaxsb(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.pmaxu.w"] + fn pmaxuw(a: u16x16, b: u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pmaxu.d"] + fn pmaxud(a: u32x8, b: u32x8) -> u32x8; + #[link_name = "llvm.x86.avx2.pmaxu.b"] + fn pmaxub(a: u8x32, b: u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.pmins.w"] + fn pminsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pmins.d"] + fn pminsd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.pmins.b"] + fn pminsb(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.pminu.w"] + fn pminuw(a: u16x16, b: u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pminu.d"] + fn pminud(a: u32x8, b: u32x8) -> u32x8; + #[link_name = "llvm.x86.avx2.pminu.b"] + fn pminub(a: u8x32, b: u8x32) -> u8x32; } @@ -839,4 +929,85 @@ mod tests { assert_eq!(r, b); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epu16() { + let a = u16x16::splat(2); + let b = u16x16::splat(4); + let r = avx2::_mm256_max_epu16(a, b); + assert_eq!(r, b); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epu32() { + let a = u32x8::splat(2); + let b = u32x8::splat(4); + let r = avx2::_mm256_max_epu32(a, b); + assert_eq!(r, b); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_max_epu8() { + let a = u8x32::splat(2); + let b = u8x32::splat(4); + let r = avx2::_mm256_max_epu8(a, b); + assert_eq!(r, b); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_min_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_min_epi16(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_min_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_min_epi32(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_min_epi8() { + let a = i8x32::splat(2); + let b = i8x32::splat(4); + let r = avx2::_mm256_min_epi8(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_min_epu16() { + let a = u16x16::splat(2); + let b = u16x16::splat(4); + let r = avx2::_mm256_min_epu16(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_min_epu32() { + let a = u32x8::splat(2); + let b = u32x8::splat(4); + let r = avx2::_mm256_min_epu32(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_min_epu8() { + let a = u8x32::splat(2); + let b = u8x32::splat(4); + let r = avx2::_mm256_min_epu8(a, b); + assert_eq!(r, a); + } + } From e6e7ecf4e008d09aa317db627fae615b8cff5ad0 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 20:25:17 -0500 Subject: [PATCH 17/24] movemask, mpsad, mul --- TODO.md | 28 +++++----- src/x86/avx2.rs | 133 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 125 insertions(+), 36 deletions(-) diff --git a/TODO.md b/TODO.md index 1373340a72..8e55a47095 100644 --- a/TODO.md +++ b/TODO.md @@ -811,8 +811,8 @@ avx2 * [ ] `_mm_i64gather_epi64` * [ ] `_mm256_i64gather_epi64` * [ ] `_mm256_inserti128_si256` -* [ ] `_mm256_madd_epi16` -* [ ] `_mm256_maddubs_epi16` +* [x] `_mm256_madd_epi16` +* [x] `_mm256_maddubs_epi16` * [ ] `_mm_mask_i32gather_pd` * [ ] 
`_mm256_mask_i32gather_pd` * [ ] `_mm_mask_i32gather_ps` @@ -837,18 +837,18 @@ avx2 * [ ] `_mm256_maskstore_epi32` * [ ] `_mm_maskstore_epi64` * [ ] `_mm256_maskstore_epi64` -* [ ] `_mm256_max_epi8` -* [ ] `_mm256_max_epi16` -* [ ] `_mm256_max_epi32` -* [ ] `_mm256_max_epu8` -* [ ] `_mm256_max_epu16` -* [ ] `_mm256_max_epu32` -* [ ] `_mm256_min_epi8` -* [ ] `_mm256_min_epi16` -* [ ] `_mm256_min_epi32` -* [ ] `_mm256_min_epu8` -* [ ] `_mm256_min_epu16` -* [ ] `_mm256_min_epu32` +* [x] `_mm256_max_epi8` +* [x] `_mm256_max_epi16` +* [x] `_mm256_max_epi32` +* [x] `_mm256_max_epu8` +* [x] `_mm256_max_epu16` +* [x] `_mm256_max_epu32` +* [x] `_mm256_min_epi8` +* [x] `_mm256_min_epi16` +* [x] `_mm256_min_epi32` +* [x] `_mm256_min_epu8` +* [x] `_mm256_min_epu16` +* [x] `_mm256_min_epu32` * [ ] `_mm256_movemask_epi8` * [ ] `_mm256_mpsadbw_epu8` * [ ] `_mm256_mul_epi32` diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 0822402f2c..3c456bb11b 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -78,6 +78,8 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { paddusw(a, b) } } +// TODO _mm256_alignr_epi8 + /// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. #[inline(always)] @@ -108,7 +110,9 @@ pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { unsafe { pavgb(a, b) } } -// TODO _mm256_alignr_epi8 + + + // TODO _mm256_blend_epi16 // TODO _mm_blend_epi32 // TODO _mm256_blend_epi32 @@ -284,11 +288,11 @@ pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { // TODO _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale) // TODO _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale) // TODO _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale) -// TODO _mm256_mask_i64gather_ps +// TODO _mm256_mask_i64gather_ps // TODO _mm256_inserti128_si256 -/// Multiply packed signed 16-bit integers in `a` and `b`, producing -/// intermediate signed 32-bit integers. Horizontally add adjacent pairs +/// Multiply packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs /// of intermediate 32-bit integers. #[inline(always)] #[target_feature = "+avx2"] @@ -296,9 +300,9 @@ pub fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 { unsafe { pmaddwd(a, b) } } -/// Vertically multiply each unsigned 8-bit integer from `a` with the -/// corresponding signed 8-bit integer from `b`, producing intermediate -/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate +/// Vertically multiply each unsigned 8-bit integer from `a` with the +/// corresponding signed 8-bit integer from `b`, producing intermediate +/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate /// signed 16-bit integers #[inline(always)] #[target_feature = "+avx2"] @@ -339,7 +343,7 @@ pub fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { unsafe { pmaxsb(a, b) } } -/// Compare packed unsigned 16-bit integers in `a` and `b`, and return +/// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed maximum values. #[inline(always)] #[target_feature = "+avx2"] @@ -347,7 +351,7 @@ pub fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { pmaxuw(a, b) } } -/// Compare packed unsigned 32-bit integers in `a` and `b`, and return +/// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed maximum values. 
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -355,7 +359,7 @@ pub fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 {
     unsafe { pmaxud(a, b) }
 }
 
-/// Compare packed unsigned 8-bit integers in `a` and `b`, and return 
+/// Compare packed unsigned 8-bit integers in `a` and `b`, and return
 /// the packed maximum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -387,7 +391,7 @@ pub fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 {
     unsafe { pminsb(a, b) }
 }
 
-/// Compare packed unsigned 16-bit integers in `a` and `b`, and return 
+/// Compare packed unsigned 16-bit integers in `a` and `b`, and return
 /// the packed minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -395,7 +399,7 @@ pub fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 {
     unsafe { pminuw(a, b) }
 }
 
-/// Compare packed unsigned 32-bit integers in `a` and `b`, and return 
+/// Compare packed unsigned 32-bit integers in `a` and `b`, and return
 /// the packed minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -403,7 +407,7 @@ pub fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 {
     unsafe { pminud(a, b) }
 }
 
-/// Compare packed unsigned 8-bit integers in `a` and `b`, and return 
+/// Compare packed unsigned 8-bit integers in `a` and `b`, and return
 /// the packed minimum values.
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -411,6 +415,44 @@ pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 {
     unsafe { pminub(a, b) }
 }
 
+/// Create mask from the most significant bit of each 8-bit element in `a`,
+/// return the result.
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_movemask_epi8(a: i8x32) -> i32 {
+    unsafe { pmovmskb(a) }
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+/// 8-bit integers in `a` compared to those in `b`, and store the 16-bit
+/// results in dst. Eight SADs are performed for each 128-bit lane using one
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight
+/// quadruplets are formed from sequential 8-bit integers selected from `a`
+/// starting at the offset specified in `imm8`.
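+///
+/// As a concrete illustration (the shape of the test below, assuming
+/// `imm8 == 0`): with `a = [2; 32]` and `b = [4; 32]`, every quadruplet
+/// SAD is `|2 - 4| * 4 = 8`, so all sixteen results are `8`.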
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { + unsafe { mpsadbw(a, b, imm8) } +} + +/// Multiply the low 32-bit integers from each packed 64-bit element in +/// `a` and `b` +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i32x8 { + a * b +} + +/// Multiply the low unsigned 32-bit integers from each packed 64-bit +/// element in `a` and `b` +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u32x8 { + a * b +} + + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -426,7 +468,7 @@ extern "C" { #[link_name = "llvm.x86.avx2.paddus.b"] fn paddusb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.paddus.w"] - fn paddusw(a: u16x16, b: u16x16) -> u16x16; + fn paddusw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pavg.b"] fn pavgb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.pavg.w"] @@ -452,27 +494,32 @@ extern "C" { #[link_name = "llvm.x86.avx2.pmaxs.w"] fn pmaxsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmaxs.d"] - fn pmaxsd(a: i32x8, b: i32x8) -> i32x8; + fn pmaxsd(a: i32x8, b: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.pmaxs.b"] - fn pmaxsb(a: i8x32, b: i8x32) -> i8x32; + fn pmaxsb(a: i8x32, b: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.pmaxu.w"] fn pmaxuw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pmaxu.d"] - fn pmaxud(a: u32x8, b: u32x8) -> u32x8; + fn pmaxud(a: u32x8, b: u32x8) -> u32x8; #[link_name = "llvm.x86.avx2.pmaxu.b"] - fn pmaxub(a: u8x32, b: u8x32) -> u8x32; + fn pmaxub(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.pmins.w"] fn pminsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmins.d"] - fn pminsd(a: i32x8, b: i32x8) -> i32x8; + fn pminsd(a: i32x8, b: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.pmins.b"] - fn pminsb(a: i8x32, b: i8x32) -> i8x32; + fn pminsb(a: i8x32, b: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.pminu.w"] fn pminuw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pminu.d"] - fn pminud(a: u32x8, b: u32x8) -> u32x8; + fn pminud(a: u32x8, b: u32x8) -> u32x8; #[link_name = "llvm.x86.avx2.pminu.b"] fn pminub(a: u8x32, b: u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.pmovmskb"] + fn pmovmskb(a: i8x32) -> i32; + #[link_name = "llvm.x86.avx2.mpsadbw"] + fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; + } @@ -955,7 +1002,7 @@ mod tests { let r = avx2::_mm256_max_epu8(a, b); assert_eq!(r, b); } - + #[test] #[target_feature = "+avx2"] fn _mm256_min_epi16() { @@ -1010,4 +1057,46 @@ mod tests { assert_eq!(r, a); } + + // TODO this fails in debug but not release, why? + #[test] + #[target_feature ="+avx2"] + fn _mm256_movemask_epi8() { + let a = i8x32::splat(-1); + let r = avx2::_mm256_movemask_epi8(a); + let e : i32 = -1; + assert_eq!(r, e); + } + + // TODO This fails in debug but not in release, whhhy? 
+ #[test] + #[target_feature = "+avx2"] + fn _mm256_mpsadbw_epu8() { + let a = u8x32::splat(2); + let b = u8x32::splat(4); + let r = avx2::_mm256_mpsadbw_epu8(a, b, 0); + let e = u16x16::splat(8); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_mul_epi32() { + let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2); + let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx2::_mm256_mul_epi32(a, b); + let e = i32x8::new(0, 0, 0, 0, 10, 12, 14, 16); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_mul_epu32() { + let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2); + let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx2::_mm256_mul_epu32(a, b); + let e = u32x8::new(0, 0, 0, 0, 10, 12, 14, 16); + assert_eq!(r, e); + } + } From fe068d720f947a195c22702986183b39a1cc3ba4 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Mon, 29 May 2017 05:57:53 -0500 Subject: [PATCH 18/24] mul --- src/x86/avx2.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 6 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 3c456bb11b..bfffce3ebb 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -438,21 +438,61 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { /// Multiply the low 32-bit integers from each packed 64-bit element in /// `a` and `b` +/// +/// Return the 64-bit results. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i32x8 { - a * b +pub fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 { + unsafe { pmuldq(a, b) } } /// Multiply the low unsigned 32-bit integers from each packed 64-bit /// element in `a` and `b` +/// +/// Return the unsigned 64-bit results. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 { + unsafe { pmuludq(a, b) } +} + +/// Multiply the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u32x8 { +pub fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { pmulhw(a, b) } +} + +/// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. 
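+///
+/// Concretely (the arithmetic behind the tests below):
+/// `6535 * 6535 = 42_706_225`, and `42_706_225 >> 16 = 651`.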
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 {
+    unsafe { pmulhuw(a, b) }
+}
+
+/// Multiply the packed 16-bit integers in `a` and `b`, producing
+/// intermediate 32-bit integers, and return the low 16 bits of the
+/// intermediate integers
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mullo_epi16(a: i16x16, b:i16x16) -> i16x16 {
+    a * b
+}
+
+/// Multiply the packed 32-bit integers in `a` and `b`, producing
+/// intermediate 64-bit integers, and return the low 32 bits of the
+/// intermediate integers
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
+    a * b
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx2.pabs.b"]
@@ -519,6 +559,15 @@ extern "C" {
     fn pmovmskb(a: i8x32) -> i32;
     #[link_name = "llvm.x86.avx2.mpsadbw"]
     fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pmulhu.w"]
+    fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
+    #[link_name = "llvm.x86.avx2.pmulh.w"]
+    fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
+    #[link_name = "llvm.x86.avx2.pmul.dq"]
+    fn pmuldq(a: i32x8, b:i32x8) -> i64x4;
+    #[link_name = "llvm.x86.avx2.pmulu.dq"]
+    fn pmuludq(a: u32x8, b:u32x8) -> u64x4;
+
 }
@@ -1057,7 +1106,6 @@ mod tests {
         assert_eq!(r, a);
     }
-
     // TODO this fails in debug but not release, why?
     #[test]
     #[target_feature ="+avx2"]
@@ -1085,7 +1133,7 @@ mod tests {
         let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
         let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
         let r = avx2::_mm256_mul_epi32(a, b);
-        let e = i32x8::new(0, 0, 0, 0, 10, 12, 14, 16);
+        let e = i64x4::new(0, 0, 10, 14);
         assert_eq!(r, e);
     }
@@ -1095,7 +1143,7 @@ mod tests {
         let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2);
         let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
         let r = avx2::_mm256_mul_epu32(a, b);
-        let e = u32x8::new(0, 0, 0, 0, 10, 12, 14, 16);
+        let e = u64x4::new(0, 0, 10, 14);
         assert_eq!(r, e);
     }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mulhi_epi16() {
+        let a = i16x16::splat(6535);
+        let b = i16x16::splat(6535);
+        let r = avx2::_mm256_mulhi_epi16(a, b);
+        let e = i16x16::splat(651);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mulhi_epu16() {
+        let a = u16x16::splat(6535);
+        let b = u16x16::splat(6535);
+        let r = avx2::_mm256_mulhi_epu16(a, b);
+        let e = u16x16::splat(651);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mullo_epi16() {
+        let a = i16x16::splat(2);
+        let b = i16x16::splat(4);
+        let r = avx2::_mm256_mullo_epi16(a, b);
+        let e = i16x16::splat(8);
+        assert_eq!(r, e);
+    }
+
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mullo_epi32() {
+        let a = i32x8::splat(2);
+        let b = i32x8::splat(4);
+        let r = avx2::_mm256_mullo_epi32(a, b);
+        let e = i32x8::splat(8);
+        assert_eq!(r, e);
+    }
+

From d3d51e82e5da5b65e5f618137421cbf60cab7312 Mon Sep 17 00:00:00 2001
From: Jack Mott
Date: Mon, 29 May 2017 06:02:50 -0500
Subject: [PATCH 19/24] or

---
 src/x86/avx2.rs | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs
index bfffce3ebb..ca8a10fd83 100644
--- a/src/x86/avx2.rs
+++ b/src/x86/avx2.rs
@@ -493,6 +493,13 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 {
     a * b
 }
 
+/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b`
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
+    a | b
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
#[link_name = "llvm.x86.avx2.pabs.b"] @@ -1177,7 +1184,7 @@ mod tests { assert_eq!(r, e); } - #[test] + #[test] #[target_feature = "+avx2"] fn _mm256_mullo_epi32() { let a = i32x8::splat(2); @@ -1187,4 +1194,13 @@ mod tests { assert_eq!(r, e); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_or_si256() { + let a = __m256i::splat(-1); + let b = __m256i::splat(0); + let r = avx2::_mm256_or_si256(a, b); + assert_eq!(r, a); + + } } From aab40411bf55725ac64c95f1747850e31e95e0a1 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Mon, 29 May 2017 06:19:26 -0500 Subject: [PATCH 20/24] pack --- src/x86/avx2.rs | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index ca8a10fd83..27cfe69408 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -500,6 +500,42 @@ pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { a | b } +/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using +/// signed saturation +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 { + unsafe { packsswb(a, b) } +} + +/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using +/// signed saturation +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 { + unsafe { packssdw(a, b) } +} + +/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using +/// unsigned saturation +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 { + unsafe { packuswb(a, b) } +} + +/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using +/// unsigned saturation +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 { + unsafe { packusdw(a, b) } +} + + + + + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -574,6 +610,14 @@ extern "C" { fn pmuldq(a: i32x8, b:i32x8) -> i64x4; #[link_name = "llvm.x86.avx2.pmulu.dq"] fn pmuludq(a: u32x8, b:u32x8) -> u64x4; + #[link_name = "llvm.x86.avx2.packsswb"] + fn packsswb(a: i16x16, b: i16x16) -> i8x32; + #[link_name = "llvm.x86.avx2.packssdw"] + fn packssdw(a: i32x8, b: i32x8) -> i16x16; + #[link_name = "llvm.x86.avx2.packuswb"] + fn packuswb(a: i16x16, b: i16x16) -> u8x32; + #[link_name = "llvm.x86.avx2.packusdw"] + fn packusdw(a: i32x8, b: i32x8) -> u16x16; } @@ -1201,6 +1245,68 @@ mod tests { let b = __m256i::splat(0); let r = avx2::_mm256_or_si256(a, b); assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_packs_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_packs_epi16(a, b); + let e = i8x32::new( + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4); + + assert_eq!(r, e); + } + #[test] + #[target_feature = "+avx2"] + fn _mm256_packs_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_packs_epi32(a, b); + let e = i16x16::new( + 2, 2, 2, 2, + 4, 4, 4, 4, + 2, 2, 2, 2, + 4, 4, 4, 4); + + assert_eq!(r, e); } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_packus_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_packus_epi16(a, b); + let e = u8x32::new( + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4); + + assert_eq!(r, e); + } + + #[test] + 
#[target_feature = "+avx2"] + fn _mm256_packus_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_packus_epi32(a, b); + let e = u16x16::new( + 2, 2, 2, 2, + 4, 4, 4, 4, + 2, 2, 2, 2, + 4, 4, 4, 4); + + assert_eq!(r, e); + } + + + } From 8a7ffa38dc556c9ab9d94ff9bbeb0b105c39ad90 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Mon, 29 May 2017 06:29:45 -0500 Subject: [PATCH 21/24] sad --- src/x86/avx2.rs | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 27cfe69408..08ae1adc43 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -493,46 +493,61 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 { a * b } -/// Compute the bitwise OR of 256 bits (representing integer data) in `a` and `b` +/// Compute the bitwise OR of 256 bits (representing integer data) in `a` +/// and `b` #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { a | b } -/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using -/// signed saturation +/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using signed saturation #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 { unsafe { packsswb(a, b) } } -/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using -/// signed saturation +/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using signed saturation #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 { unsafe { packssdw(a, b) } } -/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers using -/// unsigned saturation +/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// using unsigned saturation #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 { unsafe { packuswb(a, b) } } -/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers using -/// unsigned saturation +/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// using unsigned saturation #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 { unsafe { packusdw(a, b) } } +// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8) +// TODO _mm256_permute4x64_epi64 (__m256i a, const int imm8) +// TODO _mm256_permute4x64_pd (__m256d a, const int imm8) +// TODO _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx) +// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx) +/// Compute the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to +/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit +/// integers in the low 16 bits of the 64-bit return value +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 { + unsafe { psadbw(a, b) } +} @@ -618,7 +633,8 @@ extern "C" { fn packuswb(a: i16x16, b: i16x16) -> u8x32; #[link_name = "llvm.x86.avx2.packusdw"] fn packusdw(a: i32x8, b: i32x8) -> u16x16; - + #[link_name = "llvm.x86.avx2.psad.bw"] + fn psadbw(a: u8x32, b: u8x32) -> u64x4; } @@ -1307,6 +1323,14 @@ mod tests { assert_eq!(r, e); } - + #[test] + #[target_feature = "+avx2"] + fn _mm256_sad_epu8() { + let a = 
u8x32::splat(2); + let b = u8x32::splat(4); + let r = avx2::_mm256_sad_epu8(a, b); + let e = u64x4::splat(16); + assert_eq!(r, e); + } } From 8ee644cb6dc2a0aa6a27ebc5dc45ba4335ddb44a Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Mon, 29 May 2017 08:07:16 -0500 Subject: [PATCH 22/24] sign --- src/x86/avx2.rs | 70 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 08ae1adc43..b11a0eb6f6 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -415,6 +415,8 @@ pub fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 { unsafe { pminub(a, b) } } +/*** The following two functions fail in debug, but work in release + /// Create mask from the most significant bit of each 8-bit element in `a`, /// return the result. #[inline(always)] @@ -436,6 +438,8 @@ pub fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { unsafe { mpsadbw(a, b, imm8) } } +***/ + /// Multiply the low 32-bit integers from each packed 64-bit element in /// `a` and `b` /// @@ -549,6 +553,29 @@ pub fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 { unsafe { psadbw(a, b) } } +// TODO _mm256_shuffle_epi32 (__m256i a, const int imm8) +// TODO _mm256_shuffle_epi8 (__m256i a, __m256i b) +// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8) +// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8) + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { psignw(a, b) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { psignd(a, b) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 { + unsafe { psignb(a, b) } +} + #[allow(improper_ctypes)] @@ -613,9 +640,9 @@ extern "C" { fn pminud(a: u32x8, b: u32x8) -> u32x8; #[link_name = "llvm.x86.avx2.pminu.b"] fn pminub(a: u8x32, b: u8x32) -> u8x32; - #[link_name = "llvm.x86.avx2.pmovmskb"] + #[link_name = "llvm.x86.avx2.pmovmskb"] //fails in debug fn pmovmskb(a: i8x32) -> i32; - #[link_name = "llvm.x86.avx2.mpsadbw"] + #[link_name = "llvm.x86.avx2.mpsadbw"] //fails in debug fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; #[link_name = "llvm.x86.avx2.pmulhu.w"] fn pmulhuw(a: u16x16, b: u16x16) -> u16x16; @@ -635,6 +662,12 @@ extern "C" { fn packusdw(a: i32x8, b: i32x8) -> u16x16; #[link_name = "llvm.x86.avx2.psad.bw"] fn psadbw(a: u8x32, b: u8x32) -> u64x4; + #[link_name = "llvm.x86.avx2.psign.b"] + fn psignb(a: i8x32, b: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.psign.w"] + fn psignw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.psign.d"] + fn psignd(a: i32x8, b: i32x8) -> i32x8; } @@ -1173,6 +1206,8 @@ mod tests { assert_eq!(r, a); } + +/** // TODO this fails in debug but not release, why? 
#[test] #[target_feature ="+avx2"] @@ -1193,6 +1228,7 @@ mod tests { let e = u16x16::splat(8); assert_eq!(r, e); } +**/ #[test] #[target_feature = "+avx2"] @@ -1333,4 +1369,34 @@ mod tests { assert_eq!(r, e); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_sign_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(-1); + let r = avx2::_mm256_sign_epi16(a, b); + let e = i16x16::splat(-2); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_sign_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(-1); + let r = avx2::_mm256_sign_epi32(a, b); + let e = i32x8::splat(-2); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_sign_epi8() { + let a = i8x32::splat(2); + let b = i8x32::splat(-1); + let r = avx2::_mm256_sign_epi8(a, b); + let e = i8x32::splat(-2); + assert_eq!(r, e); + } + } From 78c3c28bf9048b96fa881ff7a72b2588eb860499 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Mon, 29 May 2017 08:14:31 -0500 Subject: [PATCH 23/24] mulhrs --- TODO.md | 30 +++++++++++++++--------------- src/x86/avx2.rs | 22 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/TODO.md b/TODO.md index 8e55a47095..cc48c26c09 100644 --- a/TODO.md +++ b/TODO.md @@ -851,31 +851,31 @@ avx2 * [x] `_mm256_min_epu32` * [ ] `_mm256_movemask_epi8` * [ ] `_mm256_mpsadbw_epu8` -* [ ] `_mm256_mul_epi32` -* [ ] `_mm256_mul_epu32` -* [ ] `_mm256_mulhi_epi16` -* [ ] `_mm256_mulhi_epu16` +* [x] `_mm256_mul_epi32` +* [x] `_mm256_mul_epu32` +* [x] `_mm256_mulhi_epi16` +* [x] `_mm256_mulhi_epu16` * [ ] `_mm256_mulhrs_epi16` -* [ ] `_mm256_mullo_epi16` -* [ ] `_mm256_mullo_epi32` -* [ ] `_mm256_or_si256` -* [ ] `_mm256_packs_epi16` -* [ ] `_mm256_packs_epi32` -* [ ] `_mm256_packus_epi16` -* [ ] `_mm256_packus_epi32` +* [x] `_mm256_mullo_epi16` +* [x] `_mm256_mullo_epi32` +* [x] `_mm256_or_si256` +* [x] `_mm256_packs_epi16` +* [x] `_mm256_packs_epi32` +* [x] `_mm256_packus_epi16` +* [x] `_mm256_packus_epi32` * [ ] `_mm256_permute2x128_si256` * [ ] `_mm256_permute4x64_epi64` * [ ] `_mm256_permute4x64_pd` * [ ] `_mm256_permutevar8x32_epi32` * [ ] `_mm256_permutevar8x32_ps` -* [ ] `_mm256_sad_epu8` +* [x] `_mm256_sad_epu8` * [ ] `_mm256_shuffle_epi32` * [ ] `_mm256_shuffle_epi8` * [ ] `_mm256_shufflehi_epi16` * [ ] `_mm256_shufflelo_epi16` -* [ ] `_mm256_sign_epi8` -* [ ] `_mm256_sign_epi16` -* [ ] `_mm256_sign_epi32` +* [x] `_mm256_sign_epi8` +* [x] `_mm256_sign_epi16` +* [x] `_mm256_sign_epi32` * [ ] `_mm256_slli_si256` * [ ] `_mm256_bslli_epi128` * [ ] `_mm256_sll_epi16` diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index b11a0eb6f6..a92f055276 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -497,6 +497,16 @@ pub fn _mm256_mullo_epi32(a: i32x8, b:i32x8) -> i32x8 { a * b } +/// Multiply packed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. 
Truncate each intermediate
+/// integer to the 18 most significant bits, round by adding 1, and
+/// return bits [16:1]
+#[inline(always)]
+#[target_feature = "+avx2"]
+pub fn _mm256_mulhrs_epi16(a: i16x16, b: i16x16) -> i16x16 {
+    unsafe { pmulhrsw(a, b) }
+}
+
 /// Compute the bitwise OR of 256 bits (representing integer data) in `a`
 /// and `b`
 #[inline(always)]
 #[target_feature = "+avx2"]
@@ -652,6 +662,8 @@ extern "C" {
     fn pmuldq(a: i32x8, b:i32x8) -> i64x4;
     #[link_name = "llvm.x86.avx2.pmulu.dq"]
     fn pmuludq(a: u32x8, b:u32x8) -> u64x4;
+    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
+    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
     #[link_name = "llvm.x86.avx2.packsswb"]
     fn packsswb(a: i16x16, b: i16x16) -> i8x32;
     #[link_name = "llvm.x86.avx2.packssdw"]
@@ -1290,6 +1302,16 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[test]
+    #[target_feature = "+avx2"]
+    fn _mm256_mulhrs_epi16() {
+        let a = i16x16::splat(2);
+        let b = i16x16::splat(4);
+        let r = avx2::_mm256_mulhrs_epi16(a, b);
+        // ((2 * 4) + (1 << 14)) >> 15 == 0, so the rounded high product is 0
+        let e = i16x16::splat(0);
+        assert_eq!(r, e);
+    }
+
     #[test]
     #[target_feature = "+avx2"]
     fn _mm256_or_si256() {

From 9464d4680b2e5afd5bc9021d6cedc83b3e141a64 Mon Sep 17 00:00:00 2001
From: Jack Mott
Date: Mon, 29 May 2017 08:15:18 -0500
Subject: [PATCH 24/24] update TODO

---
 TODO.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TODO.md b/TODO.md
index cc48c26c09..48d81f7b2d 100644
--- a/TODO.md
+++ b/TODO.md
@@ -855,7 +855,7 @@ avx2
 * [x] `_mm256_mul_epu32`
 * [x] `_mm256_mulhi_epi16`
 * [x] `_mm256_mulhi_epu16`
-* [ ] `_mm256_mulhrs_epi16`
+* [x] `_mm256_mulhrs_epi16`
 * [x] `_mm256_mullo_epi16`
 * [x] `_mm256_mullo_epi32`
 * [x] `_mm256_or_si256`