From 4c29b04d90f8999aeb1a57e919603cf1e1c44e16 Mon Sep 17 00:00:00 2001
From: Paolo Teti <paolo.teti@gmail.com>
Date: Wed, 18 Jul 2018 20:55:37 +0200
Subject: [PATCH 1/2] Add a few ARM DSP intrinsics

- Signed saturating add/sub
- Saturating four 8-bit integer add/sub
- Saturating two 16-bit integer add/sub

The intent is mainly to set up the module; the remaining
intrinsics will be added in the future.

The listed intrinsics are also available on Cortex-M (+dsp is
required on some models, except for the M4).
---
 coresimd/arm/dsp.rs            | 183 +++++++++++++++++++++++++++++++++
 coresimd/arm/mod.rs            |   5 +
 crates/stdsimd-test/src/lib.rs |   4 +
 3 files changed, 192 insertions(+)
 create mode 100644 coresimd/arm/dsp.rs

diff --git a/coresimd/arm/dsp.rs b/coresimd/arm/dsp.rs
new file mode 100644
index 0000000000..c6022802a6
--- /dev/null
+++ b/coresimd/arm/dsp.rs
@@ -0,0 +1,183 @@
+//! ARM DSP Intrinsics.
+
+use coresimd::simd::*;
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+types! {
+    /// ARM-specific 32-bit wide vector of four packed `i8`.
+    pub struct int8x4_t(i8, i8, i8, i8);
+    /// ARM-specific 32-bit wide vector of four packed `u8`.
+    pub struct uint8x4_t(u8, u8, u8, u8);
+    /// ARM-specific 32-bit wide vector of two packed `i16`.
+    pub struct int16x2_t(i16, i16);
+    /// ARM-specific 32-bit wide vector of two packed `u16`.
+    pub struct uint16x2_t(u16, u16);
+}
+
+impl_from_bits_!(int8x4_t: u32, i32, u16x2, i16x2, m16x2, u8x4, i8x4, m8x4);
+impl_from_bits_!(uint8x4_t: u32, i32, u16x2, i16x2, m16x2, u8x4, i8x4, m8x4);
+impl_from_bits_!(int16x2_t: u32, i32, u16x2, i16x2, m16x2, u8x4, i8x4, m8x4);
+impl_from_bits_!(uint16x2_t: u32, i32, u16x2, i16x2, m16x2, u8x4, i8x4, m8x4);
+
+extern "C" {
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd")]
+    fn arm_qadd(a: i32, b: i32) -> i32;
+
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub")]
+    fn arm_qsub(a: i32, b: i32) -> i32;
+
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd8")]
+    fn arm_qadd8(a: i32, b: i32) -> i32;
+
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub8")]
+    fn arm_qsub8(a: i32, b: i32) -> i32;
+
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd16")]
+    fn arm_qadd16(a: i32, b: i32) -> i32;
+
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qsub16")]
+    fn arm_qsub16(a: i32, b: i32) -> i32;
+}
+
+/// Signed saturating addition
+///
+/// Returns the 32-bit saturating signed equivalent of a + b.
+#[inline]
+#[cfg_attr(test, assert_instr(qadd))]
+pub unsafe fn qadd(a: i32, b: i32) -> i32 {
+    arm_qadd(a, b)
+}
+
+/// Signed saturating subtraction
+///
+/// Returns the 32-bit saturating signed equivalent of a - b.
+#[inline]
+#[cfg_attr(test, assert_instr(qsub))]
+pub unsafe fn qsub(a: i32, b: i32) -> i32 {
+    arm_qsub(a, b)
+}
+
+/// Saturating four 8-bit integer additions
+///
+/// Returns the 8-bit signed saturated equivalent of
+///
+/// res[0] = a[0] + b[0]
+/// res[1] = a[1] + b[1]
+/// res[2] = a[2] + b[2]
+/// res[3] = a[3] + b[3]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd8))]
+pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    arm_qadd8(a.into_bits(), b.into_bits()).into_bits()
+}
+
+/// Saturating two 8-bit integer subtraction
+///
+/// Returns the 8-bit signed saturated equivalent of
+///
+/// res[0] = a[0] - b[0]
+/// res[1] = a[1] - b[1]
+/// res[2] = a[2] - b[2]
+/// res[3] = a[3] - b[3]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub8))]
+pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    arm_qsub8(a.into_bits(), b.into_bits()).into_bits()
+}
+
+/// Saturating two 16-bit integer subtraction
+///
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res[0] = a[0] - b[0]
+/// res[1] = a[1] - b[1]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub16))]
+pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    arm_qsub16(a.into_bits(), b.into_bits()).into_bits()
+}
+
+/// Saturating two 16-bit integer additions
+///
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res[0] = a[0] + b[0]
+/// res[1] = a[1] + b[1]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd16))]
+pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    arm_qadd16(a.into_bits(), b.into_bits()).into_bits()
+}
+
+#[cfg(test)]
+mod tests {
+    use coresimd::arm::*;
+    use coresimd::simd::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    #[test]
+    fn qadd() {
+        unsafe {
+            assert_eq!(dsp::qadd(-10, 60), 50);
+            assert_eq!(dsp::qadd(::std::i32::MAX, 10), ::std::i32::MAX);
+            assert_eq!(dsp::qadd(::std::i32::MIN, -10), ::std::i32::MIN);
+        }
+    }
+
+    #[test]
+    fn qsub() {
+        unsafe {
+            assert_eq!(dsp::qsub(10, 60), -50);
+            assert_eq!(dsp::qsub(::std::i32::MAX, -10), ::std::i32::MAX);
+            assert_eq!(dsp::qsub(::std::i32::MIN, 10), ::std::i32::MIN);
+        }
+    }
+
+    #[test]
+    fn qadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(2, -1, 0, 1);
+            let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
+            let r: i8x4 = dsp::qadd8(a.into_bits(), b.into_bits()).into_bits();
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsub8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
+            let b = i8x4::new(2, -1, 0, 1);
+            let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
+            let r: i8x4 = dsp::qsub8(a.into_bits(), b.into_bits()).into_bits();
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qadd16() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(2, -1);
+            let c = i16x2::new(3, 1);
+            let r: i16x2 =
+                dsp::qadd16(a.into_bits(), b.into_bits()).into_bits();
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsub16() {
+        unsafe {
+            let a = i16x2::new(10, 20);
+            let b = i16x2::new(20, -10);
+            let c = i16x2::new(-10, 30);
+            let r: i16x2 =
+                dsp::qsub16(a.into_bits(), b.into_bits()).into_bits();
+            assert_eq!(r, c);
+        }
+    }
+}
diff --git a/coresimd/arm/mod.rs b/coresimd/arm/mod.rs
index 5ad4439079..70869d6e2d 100644
--- a/coresimd/arm/mod.rs
+++ b/coresimd/arm/mod.rs
@@ -20,6 +20,11 @@ mod v7;
 #[cfg(any(target_arch = "aarch64", target_feature = "v7"))]
 pub use self::v7::*;
 
+#[cfg(all(target_arch = "arm", target_feature = "v7"))]
+mod dsp;
+#[cfg(all(target_arch = "arm", target_feature = "v7"))]
+pub use self::dsp::*;
+
 // NEON is supported on AArch64, and on ARM when built with the v7 and neon
 // features. Building ARM without neon produces incorrect codegen.
 #[cfg(
diff --git a/crates/stdsimd-test/src/lib.rs b/crates/stdsimd-test/src/lib.rs
index 2c415e4a3f..77547377c1 100644
--- a/crates/stdsimd-test/src/lib.rs
+++ b/crates/stdsimd-test/src/lib.rs
@@ -351,6 +351,10 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
         // cases exceed the limit.
         "cvtpi2ps" => 25,
 
+        // In these cases the overall length, including the 'mergefunc'
+        // workaround overhead, is exactly 20 instructions (not < 20).
+        "qsub8" | "qadd8" | "qsub16" | "qadd16" => 22,
+
         _ => 20,
     };
     let probably_only_one_instruction = instrs.len() < instruction_limit;

From d01ae81d84ad08d96e3279c9b758230176ce0909 Mon Sep 17 00:00:00 2001
From: Paolo Teti <paolo.teti@gmail.com>
Date: Thu, 19 Jul 2018 18:48:44 +0200
Subject: [PATCH 2/2] Arm DSP: rebase and remove portable vector types

Rebase everything on top of master since the portable vector types
have been removed.
---
 coresimd/arm/dsp.rs | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/coresimd/arm/dsp.rs b/coresimd/arm/dsp.rs
index c6022802a6..740868ecc1 100644
--- a/coresimd/arm/dsp.rs
+++ b/coresimd/arm/dsp.rs
@@ -1,6 +1,5 @@
 //! ARM DSP Intrinsics.
 
-use coresimd::simd::*;
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
@@ -15,11 +14,6 @@ types! {
     pub struct uint16x2_t(u16, u16);
 }
 
-impl_from_bits_!(int8x4_t: u32, i32, u16x2, i16x2, m16x2, u8x4, i8x4, m8x4);
-impl_from_bits_!(uint8x4_t: u32, i32, u16x2, i16x2, m16x2, u8x4, i8x4, m8x4);
-impl_from_bits_!(int16x2_t: u32, i32, u16x2, i16x2, m16x2, u8x4, i8x4, m8x4);
-impl_from_bits_!(uint16x2_t: u32, i32, u16x2, i16x2, m16x2, u8x4, i8x4, m8x4);
-
 extern "C" {
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.qadd")]
     fn arm_qadd(a: i32, b: i32) -> i32;
@@ -69,7 +63,7 @@ pub unsafe fn qsub(a: i32, b: i32) -> i32 {
 #[inline]
 #[cfg_attr(test, assert_instr(qadd8))]
 pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    arm_qadd8(a.into_bits(), b.into_bits()).into_bits()
+    ::mem::transmute(arm_qadd8(::mem::transmute(a), ::mem::transmute(b)))
 }
 
 /// Saturating two 8-bit integer subtraction
@@ -83,7 +77,7 @@ pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 #[inline]
 #[cfg_attr(test, assert_instr(qsub8))]
 pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    arm_qsub8(a.into_bits(), b.into_bits()).into_bits()
+    ::mem::transmute(arm_qsub8(::mem::transmute(a), ::mem::transmute(b)))
 }
 
 /// Saturating two 16-bit integer subtraction
@@ -95,7 +89,7 @@ pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 #[inline]
 #[cfg_attr(test, assert_instr(qsub16))]
 pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    arm_qsub16(a.into_bits(), b.into_bits()).into_bits()
+    ::mem::transmute(arm_qsub16(::mem::transmute(a), ::mem::transmute(b)))
 }
 
 /// Saturating two 16-bit integer additions
@@ -107,7 +101,7 @@ pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 #[inline]
 #[cfg_attr(test, assert_instr(qadd16))]
 pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    arm_qadd16(a.into_bits(), b.into_bits()).into_bits()
+    ::mem::transmute(arm_qadd16(::mem::transmute(a), ::mem::transmute(b)))
 }
 
 #[cfg(test)]
@@ -141,7 +135,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
             let b = i8x4::new(2, -1, 0, 1);
             let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
-            let r: i8x4 = dsp::qadd8(a.into_bits(), b.into_bits()).into_bits();
+            let r: i8x4 = ::mem::transmute(dsp::qadd8(::mem::transmute(a), ::mem::transmute(b)));
             assert_eq!(r, c);
         }
     }
@@ -152,7 +146,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
             let b = i8x4::new(2, -1, 0, 1);
             let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
-            let r: i8x4 = dsp::qsub8(a.into_bits(), b.into_bits()).into_bits();
+            let r: i8x4 = ::mem::transmute(dsp::qsub8(::mem::transmute(a),::mem::transmute(b)));
             assert_eq!(r, c);
         }
     }
@@ -163,8 +157,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(2, -1);
             let c = i16x2::new(3, 1);
-            let r: i16x2 =
-                dsp::qadd16(a.into_bits(), b.into_bits()).into_bits();
+            let r: i16x2 = ::mem::transmute(dsp::qadd16(::mem::transmute(a),::mem::transmute(b)));
             assert_eq!(r, c);
         }
     }
@@ -175,8 +168,7 @@ mod tests {
             let a = i16x2::new(10, 20);
             let b = i16x2::new(20, -10);
             let c = i16x2::new(-10, 30);
-            let r: i16x2 =
-                dsp::qsub16(a.into_bits(), b.into_bits()).into_bits();
+            let r: i16x2 = ::mem::transmute(dsp::qsub16(::mem::transmute(a), ::mem::transmute(b)));
             assert_eq!(r, c);
         }
     }